diff --git a/.asf.yaml b/.asf.yaml
index ae4827046242c..3a67e45bfdbeb 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -38,7 +38,7 @@ github:
         required_approving_review_count: 1
       required_signatures: false
       required_linear_history: true
-      required_conversation_resolution: true
+      required_conversation_resolution: false
   features:
     issues: true
     discussions: true
diff --git a/.github/ISSUE_TEMPLATE/bug.yml b/.github/ISSUE_TEMPLATE/bug.yml
index ad2667ed87da3..1de2d1c755de5 100644
--- a/.github/ISSUE_TEMPLATE/bug.yml
+++ b/.github/ISSUE_TEMPLATE/bug.yml
@@ -50,6 +50,8 @@ body:
         options:
           - Spark-3.2.x
           - Spark-3.3.x
+          - Spark-3.4.x
+          - Spark-3.5.x
     validations:
       required: false
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000000..545ccc8c21d78
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+blank_issues_enabled: false
+contact_links:
+  - name: Gluten Discussions
+    url: https://github.com/apache/incubator-gluten/discussions
+    about: Ask questions or discuss new feature ideas here.
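The two appended entries extend the Spark-version dropdown of the bug-report issue form to cover Spark 3.4.x and 3.5.x, and the new `config.yml` disables blank issues so that questions are routed to GitHub Discussions rather than the issue tracker. For orientation, the sketch below shows roughly where such appended options sit inside a GitHub issue-form dropdown; the field id and label are illustrative guesses, not copied from the repository's `bug.yml`.

```yaml
# Minimal GitHub issue-form dropdown, for illustration only.
# The id and label below are assumptions; only the two new options
# (Spark-3.4.x, Spark-3.5.x) come from this patch.
body:
  - type: dropdown
    id: spark-version          # hypothetical field id
    attributes:
      label: Spark version
      options:
        - Spark-3.2.x
        - Spark-3.3.x
        - Spark-3.4.x          # added by this change
        - Spark-3.5.x          # added by this change
    validations:
      required: false
```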
diff --git a/.github/workflows/build_bundle_package.yml b/.github/workflows/build_bundle_package.yml index 01ddd6f438570..8ce6593667706 100644 --- a/.github/workflows/build_bundle_package.yml +++ b/.github/workflows/build_bundle_package.yml @@ -38,7 +38,7 @@ on: jobs: build-native-lib: runs-on: ubuntu-20.04 - container: apache/gluten:gluten-vcpkg-builder_2024_03_17 + container: apache/gluten:gluten-vcpkg-builder_2024_05_29 steps: - uses: actions/checkout@v2 - name: Build Gluten velox third party @@ -53,11 +53,17 @@ jobs: export NUM_THREADS=4 ./dev/builddeps-veloxbe.sh --build_tests=OFF --build_benchmarks=OFF --enable_s3=OFF \ --enable_gcs=OFF --enable_hdfs=ON --enable_abfs=OFF - - uses: actions/upload-artifact@v2 + - name: Upload native libs + uses: actions/upload-artifact@v2 with: path: ./cpp/build/releases/ name: velox-native-lib-${{github.sha}} retention-days: 1 + - name: Upload Artifact Arrow Jar + uses: actions/upload-artifact@v2 + with: + path: /root/.m2/repository/org/apache/arrow/ + name: velox-arrow-jar-centos-7-${{github.sha}} build-bundle-package-ubuntu: if: startsWith(github.event.inputs.os, 'ubuntu') @@ -71,6 +77,11 @@ jobs: with: name: velox-native-lib-${{github.sha}} path: ./cpp/build/releases + - name: Download All Arrow Jar Artifacts + uses: actions/download-artifact@v2 + with: + name: velox-arrow-jar-centos-7-${{github.sha}} + path: /root/.m2/repository/org/apache/arrow/ - name: Setup java and maven run: | apt-get update && \ @@ -99,6 +110,11 @@ jobs: with: name: velox-native-lib-${{github.sha}} path: ./cpp/build/releases + - name: Download All Arrow Jar Artifacts + uses: actions/download-artifact@v2 + with: + name: velox-arrow-jar-centos-7-${{github.sha}} + path: /root/.m2/repository/org/apache/arrow/ - name: Setup java and maven run: | yum update -y && yum install -y java-1.8.0-openjdk-devel wget && \ @@ -130,6 +146,11 @@ jobs: with: name: velox-native-lib-${{github.sha}} path: ./cpp/build/releases + - name: Download All Arrow Jar Artifacts + uses: actions/download-artifact@v2 + with: + name: velox-arrow-jar-centos-7-${{github.sha}} + path: /root/.m2/repository/org/apache/arrow/ - name: Update mirror list run: | sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true && \ diff --git a/ep/build-velox/src/build_arrow_deps_centos8.sh b/.github/workflows/check_license.yml old mode 100755 new mode 100644 similarity index 53% rename from ep/build-velox/src/build_arrow_deps_centos8.sh rename to .github/workflows/check_license.yml index 8dfc2af9375e6..338397dbd6cbf --- a/ep/build-velox/src/build_arrow_deps_centos8.sh +++ b/.github/workflows/check_license.yml @@ -1,4 +1,3 @@ -#!/bin/bash # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -14,32 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-set -exu +name: license header check +on: + pull_request +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true -NPROC=$(getconf _NPROCESSORS_ONLN) - -function wget_and_untar { - local URL=$1 - local DIR=$2 - mkdir -p "${DIR}" - pushd "${DIR}" - curl -L "${URL}" > $2.tar.gz - tar -xz --strip-components=1 -f $2.tar.gz - popd -} - -function install_openssl { - wget_and_untar https://github.com/openssl/openssl/archive/refs/tags/OpenSSL_1_1_1s.tar.gz openssl - cd openssl - ./config no-shared && make depend && make && sudo make install - cd .. -} - -function install_arrow_deps { - install_openssl -} - -# Activate gcc9; enable errors on unset variables afterwards. -# source /opt/rh/gcc-toolset-9/enable || exit 1 -install_arrow_deps -echo "All dependencies for Arrow installed!" +jobs: + license-check: + name: License Header Check + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Check License Header + run: | + git fetch --recurse-submodules=no origin main ${{github.event.pull_request.base.sha}} + pip install regex + cd $GITHUB_WORKSPACE/ + ./.github/workflows/util/check.sh ${{github.event.pull_request.base.sha}} diff --git a/.github/workflows/code_style.yml b/.github/workflows/code_style.yml index f1c9c25483247..8417264373a06 100644 --- a/.github/workflows/code_style.yml +++ b/.github/workflows/code_style.yml @@ -14,43 +14,56 @@ # limitations under the License. name: Code style checks - on: - pull_request - + pull_request: + paths: + - '.github/workflows/code_style.yml' + - 'cpp/**' + - 'cpp-ch/**' concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true jobs: - formatting-check: - name: Formatting Check + CPP-format-check: + name: CPP Format Check runs-on: ubuntu-latest strategy: matrix: path: - - check: 'cpp/core' - exclude: '' - - check: 'cpp/velox' + - check: 'cpp' exclude: '' steps: - uses: actions/checkout@v4 - - name: Run clang-format style check for C/C++ programs. + - name: Run clang-format style check for C/C++ uses: jidicula/clang-format-action@v4.11.0 with: clang-format-version: '15' check-path: ${{ matrix.path['check'] }} fallback-style: 'Google' # optional - license-check: - name: License Header Check + CMake-format-check: + name: CMake Format Check runs-on: ubuntu-latest - + container: ubuntu:22.04 steps: - - uses: actions/checkout@v3 - - - name: Check License Header + - name: Install tools + run: | + apt update -y + apt install git python3-pip -y + pip3 install --user cmake-format + - uses: actions/checkout@v4 + - name: Check CMake format run: | - git fetch --recurse-submodules=no origin main ${{github.event.pull_request.base.sha}} - pip install regex - dev/check.sh ${{github.event.pull_request.base.sha}} + git config --global --add safe.directory $GITHUB_WORKSPACE + cd $GITHUB_WORKSPACE/ + fileList=$(find ./cpp ./cpp-ch -name CMakeLists.txt -o -name *.cmake) + for file in $fileList; do + /github/home/.local/bin/cmake-format --first-comment-is-literal True --in-place $file + done + if [ -n "$(git status --porcelain)" ]; then + echo "Please use cmake-format to format cmake files or apply the below patch." + git diff -- '*CMakeLists.txt' '*.cmake' + exit 1 + fi + echo "No CMake format issue." 
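The relocated license-header check and the reworked code-style workflow share the same concurrency setting: runs are grouped by repository, PR branch (falling back to the commit SHA for non-PR events), and workflow name, and an in-progress run is cancelled when a newer push arrives. The code-style workflow additionally limits its trigger to C++-related paths. A minimal, generic sketch of that pattern follows; the workflow and job names are placeholders, not part of this patch.

```yaml
# Illustrative sketch of the trigger/concurrency pattern used by the new workflows.
name: example-pr-check        # placeholder name
on:
  pull_request:
    paths:                    # only run when relevant files change
      - 'cpp/**'
concurrency:
  # One group per repo + PR branch (or commit for non-PR events) + workflow,
  # so a new push cancels the still-running check for the previous push.
  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
  cancel-in-progress: true
jobs:
  check:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: echo "checks go here"   # placeholder step
```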
diff --git a/dev/check.py b/.github/workflows/util/check.py similarity index 100% rename from dev/check.py rename to .github/workflows/util/check.py diff --git a/dev/check.sh b/.github/workflows/util/check.sh similarity index 90% rename from dev/check.sh rename to .github/workflows/util/check.sh index 9b940845f92dd..d8db8bd402c37 100755 --- a/dev/check.sh +++ b/.github/workflows/util/check.sh @@ -15,9 +15,9 @@ # limitations under the License. export BASE_COMMIT=$1 -dev/check.py header branch +./.github/workflows/util/check.py header branch if [ $? -ne 0 ]; then - dev/check.py header branch --fix + ./.github/workflows/util/check.py header branch --fix echo -e "\n==== Apply using:" echo "patch -p1 \<> $GITHUB_ENV if [ "${{ matrix.java }}" = "java-17" ]; then echo "JAVA_HOME=/usr/lib/jvm/java-17-openjdk" >> $GITHUB_ENV + elif [ "${{ matrix.java }}" = "java-11" ]; then + echo "JAVA_HOME=/usr/lib/jvm/java-11-openjdk" >> $GITHUB_ENV else echo "JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk" >> $GITHUB_ENV fi @@ -172,9 +223,9 @@ jobs: run: | echo "JAVA_HOME: $JAVA_HOME" cd $GITHUB_WORKSPACE/ - mvn -ntp clean install -P${{ matrix.spark }} -P${{ matrix.java }} -Pbackends-velox -DskipTests + $MVN_CMD clean install -P${{ matrix.spark }} -P${{ matrix.java }} -Pbackends-velox -DskipTests cd $GITHUB_WORKSPACE/tools/gluten-it - mvn -ntp clean install -P${{ matrix.spark }} -P${{ matrix.java }} + $MVN_CMD clean install -P${{ matrix.spark }} -P${{ matrix.java }} - name: Run TPC-H / TPC-DS run: | echo "JAVA_HOME: $JAVA_HOME" @@ -215,11 +266,16 @@ jobs: sudo docker image prune --all --force > /dev/null df -h - uses: actions/checkout@v2 - - name: Download All Artifacts + - name: Download All Native Artifacts uses: actions/download-artifact@v2 with: name: velox-native-lib-centos-7-${{github.sha}} - path: ./cpp/build/releases + path: ./cpp/build/releases/ + - name: Download All Arrow Jar Artifacts + uses: actions/download-artifact@v2 + with: + name: velox-arrow-jar-centos-7-${{github.sha}} + path: /home/runner/.m2/repository/org/apache/arrow/ - name: Setup java and maven run: | sudo apt-get update @@ -230,14 +286,14 @@ jobs: - name: Build for Spark ${{ matrix.spark }} run: | cd $GITHUB_WORKSPACE/ - mvn -ntp clean install -P${{ matrix.spark }} -Pbackends-velox -DskipTests + $MVN_CMD clean install -P${{ matrix.spark }} -Pbackends-velox -DskipTests cd $GITHUB_WORKSPACE/tools/gluten-it - mvn -ntp clean install -P${{ matrix.spark }} + $MVN_CMD clean install -P${{ matrix.spark }} GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh data-gen-only --local --benchmark-type=ds -s=30.0 --threads=12 - name: TPC-DS SF30.0 Parquet local spark3.2 Q67/Q95 low memory, memory isolation off run: | cd tools/gluten-it \ - && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh parameterized \ + && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q67,q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ --skip-data-gen -m=OffHeapExecutionMemory \ -d=ISOLATION:OFF,spark.gluten.memory.isolation=false \ @@ -249,7 +305,7 @@ jobs: - name: TPC-DS SF30.0 Parquet local spark3.2 Q67 low memory, memory isolation on run: | cd tools/gluten-it \ - && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh parameterized \ + && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q67 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ --skip-data-gen -m=OffHeapExecutionMemory \ 
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \ @@ -260,7 +316,7 @@ jobs: - name: (To be fixed) TPC-DS SF30.0 Parquet local spark3.2 Q95 low memory, memory isolation on run: | cd tools/gluten-it \ - && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh parameterized \ + && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ --skip-data-gen -m=OffHeapExecutionMemory \ -d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \ @@ -271,19 +327,29 @@ jobs: - name: TPC-DS SF30.0 Parquet local spark3.2 Q23A/Q23B low memory run: | cd tools/gluten-it \ - && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh parameterized \ + && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q23a,q23b -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ --skip-data-gen -m=OffHeapExecutionMemory \ -d=ISOLATION:OFF,spark.gluten.memory.isolation=false \ - -d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \ -d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \ -d=FLUSH_MODE:DISABLED,spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \ -d=FLUSH_MODE:ABANDONED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \ -d=FLUSH_MODE:FLUSHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=0.05,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=0.1,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 - - name: (To be fixed) TPC-DS SF30.0 Parquet local spark3.2 Q97 low memory # The case currently causes crash with "free: invalid size". 
+ - name: (To be fixed) TPC-DS SF30.0 Parquet local spark3.2 Q23A/Q23B low memory, memory isolation on # Disabled as error https://gist.github.com/zhztheplayer/abd5e83ccdc48730678ae7ebae479fcc + run: | + cd tools/gluten-it \ + && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \ + --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q23a,q23b -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ + --skip-data-gen -m=OffHeapExecutionMemory \ + -d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \ + -d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \ + -d=FLUSH_MODE:DISABLED,spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \ + -d=FLUSH_MODE:ABANDONED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \ + -d=FLUSH_MODE:FLUSHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=0.05,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=0.1,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 || true + - name: TPC-DS SF30.0 Parquet local spark3.2 Q97 low memory run: | cd tools/gluten-it \ - && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh parameterized \ + && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q97 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ --skip-data-gen -m=OffHeapExecutionMemory \ -d=ISOLATION:OFF,spark.gluten.memory.isolation=false \ @@ -312,11 +378,16 @@ jobs: sudo docker image prune --all --force > /dev/null df -h - uses: actions/checkout@v2 - - name: Download All Artifacts + - name: Download All Native Artifacts uses: actions/download-artifact@v2 with: name: velox-native-lib-centos-7-${{github.sha}} - path: ./cpp/build/releases + path: ./cpp/build/releases/ + - name: Download All Arrow Jar Artifacts + uses: actions/download-artifact@v2 + with: + name: velox-arrow-jar-centos-7-${{github.sha}} + path: /home/runner/.m2/repository/org/apache/arrow/ - name: Setup java and maven run: | sudo apt-get update @@ -327,16 +398,16 @@ jobs: - name: Build for Spark ${{ matrix.spark }} run: | cd $GITHUB_WORKSPACE/ - mvn -ntp clean install -P${{ matrix.spark }} -Pbackends-velox -DskipTests + $MVN_CMD clean install -P${{ matrix.spark }} -Pbackends-velox -DskipTests cd $GITHUB_WORKSPACE/tools/gluten-it - mvn -ntp clean install -P${{ matrix.spark }} + $MVN_CMD clean install -P${{ matrix.spark }} GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh data-gen-only --local --benchmark-type=ds -s=30.0 --threads=12 - name: TPC-DS SF30.0 Parquet local spark3.2 random kill tasks run: | cd tools/gluten-it \ && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries \ --local --preset=velox --benchmark-type=ds --error-on-memleak -s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1 \ - 
--skip-data-gen --random-kill-tasks + --skip-data-gen --random-kill-tasks --no-session-reuse # run-tpc-test-ubuntu-sf30: # needs: build-native-lib-centos-7 @@ -375,9 +446,9 @@ jobs: # - name: Build for Spark ${{ matrix.spark }} # run: | # cd $GITHUB_WORKSPACE/ - # mvn -ntp clean install -P${{ matrix.spark }} -Pbackends-velox -DskipTests + # $MVN_CMD clean install -P${{ matrix.spark }} -Pbackends-velox -DskipTests # cd $GITHUB_WORKSPACE/tools/gluten-it - # mvn -ntp clean install -P${{ matrix.spark }} + # $MVN_CMD clean install -P${{ matrix.spark }} # GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh data-gen-only --local --benchmark-type=h -s=30.0 --threads=12 # GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh data-gen-only --local --benchmark-type=ds -s=30.0 --threads=12 # - name: TPC-H / TPC-DS SF30.0 Parquet local ${{ matrix.spark }} @@ -395,16 +466,21 @@ jobs: strategy: fail-fast: false matrix: - spark: ["spark-3.2"] + spark: [ "spark-3.2" ] runs-on: ubuntu-20.04 container: centos:8 steps: - uses: actions/checkout@v2 - - name: Download All Artifacts + - name: Download All Native Artifacts uses: actions/download-artifact@v2 with: name: velox-native-lib-centos-7-${{github.sha}} - path: ./cpp/build/releases + path: ./cpp/build/releases/ + - name: Download All Arrow Jar Artifacts + uses: actions/download-artifact@v2 + with: + name: velox-arrow-jar-centos-7-${{github.sha}} + path: /root/.m2/repository/org/apache/arrow/ - name: Update mirror list run: | sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true @@ -420,19 +496,19 @@ jobs: cd $GITHUB_WORKSPACE/ && \ export MAVEN_HOME=/usr/lib/maven && \ export PATH=${PATH}:${MAVEN_HOME}/bin && \ - mvn clean install -P${{ matrix.spark }} -Pbackends-velox -Puniffle -DskipTests + $MVN_CMD clean install -P${{ matrix.spark }} -Pbackends-velox -Puniffle -DskipTests - name: TPC-H SF1.0 && TPC-DS SF1.0 Parquet local spark3.2 with uniffle 0.8.0 run: | export MAVEN_HOME=/usr/lib/maven && \ export PATH=${PATH}:${MAVEN_HOME}/bin && \ - export export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk && \ + export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk && \ cd /opt && \ git clone -b branch-0.8 https://github.com/apache/incubator-uniffle.git && \ cd incubator-uniffle && \ sed -i '250d' ./server/src/main/java/org/apache/uniffle/server/ShuffleTaskManager.java && \ sed -i '228d' ./server/src/main/java/org/apache/uniffle/server/ShuffleTaskManager.java && \ sed -i '226d' ./server/src/main/java/org/apache/uniffle/server/ShuffleTaskManager.java && \ - mvn clean install -Phadoop2.8 -DskipTests + $MVN_CMD clean install -Phadoop2.8 -DskipTests cd /opt && \ wget -nv https://archive.apache.org/dist/incubator/uniffle/0.8.0/apache-uniffle-0.8.0-incubating-bin.tar.gz && \ tar xzf apache-uniffle-0.8.0-incubating-bin.tar.gz -C /opt/ && mv /opt/rss-0.8.0-hadoop2.8 /opt/uniffle && \ @@ -446,7 +522,7 @@ jobs: bash -c "echo -e 'rss.coordinator.shuffle.nodes.max 1\nrss.rpc.server.port 19999' > ./conf/coordinator.conf" && \ bash -c "echo -e 'rss.server.app.expired.withoutHeartbeat 7200000\nrss.server.heartbeat.delay 3000\nrss.rpc.server.port 19997\nrss.jetty.http.port 19996\nrss.server.netty.port 19995\nrss.storage.basePath /opt/uniffle/shuffle_data\nrss.storage.type MEMORY_LOCALFILE\nrss.coordinator.quorum localhost:19999\nrss.server.flush.thread.alive 10\nrss.server.single.buffer.flush.threshold 64m' > ./conf/server.conf" && \ bash ./bin/start-coordinator.sh && bash ./bin/start-shuffle-server.sh - cd $GITHUB_WORKSPACE/tools/gluten-it && mvn clean install -Pspark-3.2 -Puniffle && 
\ + cd $GITHUB_WORKSPACE/tools/gluten-it && $MVN_CMD clean install -Pspark-3.2 -Puniffle && \ GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \ --local --preset=velox-with-uniffle --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 @@ -455,40 +531,50 @@ jobs: strategy: fail-fast: false matrix: - spark: ["spark-3.2"] - celeborn: ["celeborn-0.4.0", "celeborn-0.3.2"] + spark: [ "spark-3.2" ] + celeborn: [ "celeborn-0.4.1", "celeborn-0.3.2-incubating" ] runs-on: ubuntu-20.04 container: ubuntu:22.04 steps: - uses: actions/checkout@v2 - - name: Download All Artifacts + - name: Download All Native Artifacts uses: actions/download-artifact@v2 with: name: velox-native-lib-centos-7-${{github.sha}} - path: ./cpp/build/releases + path: ./cpp/build/releases/ + - name: Download All Arrow Jar Artifacts + uses: actions/download-artifact@v2 + with: + name: velox-arrow-jar-centos-7-${{github.sha}} + path: /root/.m2/repository/org/apache/arrow/ + - name: Setup tzdata + run: | + apt-get update + TZ="Etc/GMT" DEBIAN_FRONTEND=noninteractive apt-get install -y tzdata - name: Setup java and maven run: | apt-get update && apt-get install -y openjdk-8-jdk maven wget + apt remove openjdk-11* -y + echo "JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64" >> $GITHUB_ENV - name: Build for Spark ${{ matrix.spark }} run: | cd $GITHUB_WORKSPACE/ - mvn clean install -P${{ matrix.spark }} -Pbackends-velox -Pceleborn -DskipTests + $MVN_CMD clean install -P${{ matrix.spark }} -Pbackends-velox -Pceleborn -DskipTests - name: TPC-H SF1.0 && TPC-DS SF1.0 Parquet local spark3.2 with ${{ matrix.celeborn }} run: | - export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 EXTRA_PROFILE="" if [ "${{ matrix.celeborn }}" = "celeborn-0.4.0" ]; then EXTRA_PROFILE="-Pceleborn-0.4" fi echo "EXTRA_PROFILE: ${EXTRA_PROFILE}" cd /opt && mkdir -p celeborn && \ - wget https://archive.apache.org/dist/incubator/celeborn/${{ matrix.celeborn }}-incubating/apache-${{ matrix.celeborn }}-incubating-bin.tgz && \ - tar xzf apache-${{ matrix.celeborn }}-incubating-bin.tgz -C /opt/celeborn --strip-components=1 && cd celeborn && \ + wget https://archive.apache.org/dist/celeborn/${{ matrix.celeborn }}/apache-${{ matrix.celeborn }}-bin.tgz && \ + tar xzf apache-${{ matrix.celeborn }}-bin.tgz -C /opt/celeborn --strip-components=1 && cd celeborn && \ mv ./conf/celeborn-env.sh.template ./conf/celeborn-env.sh && \ bash -c "echo -e 'CELEBORN_MASTER_MEMORY=4g\nCELEBORN_WORKER_MEMORY=4g\nCELEBORN_WORKER_OFFHEAP_MEMORY=8g' > ./conf/celeborn-env.sh" && \ bash -c "echo -e 'celeborn.worker.commitFiles.threads 128\nceleborn.worker.sortPartition.threads 64' > ./conf/celeborn-defaults.conf" && \ bash ./sbin/start-master.sh && bash ./sbin/start-worker.sh && \ - cd $GITHUB_WORKSPACE/tools/gluten-it && mvn clean install -Pspark-3.2 -Pceleborn ${EXTRA_PROFILE} && \ + cd $GITHUB_WORKSPACE/tools/gluten-it && $MVN_CMD clean install -Pspark-3.2 -Pceleborn ${EXTRA_PROFILE} && \ GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \ --local --preset=velox-with-celeborn --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=8 --iterations=1 && \ GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \ @@ -506,8 +592,14 @@ jobs: id: cache uses: actions/cache/restore@v3 with: - path: ./cpp/build/releases/ + path: | + ./cpp/build/releases/ + /root/.m2/repository/org/apache/arrow/ key: cache-velox-build-centos-8-${{ hashFiles('./cache-key') }} + - name: Update mirror list + run: | + sed -i -e 
"s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true + sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || true - name: Setup build dependency if: ${{ steps.cache.outputs.cache-hit != 'true' }} run: | @@ -519,13 +611,15 @@ jobs: - name: Build Gluten Velox third party if: ${{ steps.cache.outputs.cache-hit != 'true' }} run: | - cd ep/build-velox/src - ./get_velox.sh source /opt/rh/gcc-toolset-9/enable - ./build_arrow_deps_centos8.sh - ./build_velox.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_test_utils=ON - cd $GITHUB_WORKSPACE/cpp - ./compile.sh --build_velox_backend=ON --build_protobuf=ON --build_tests=ON --build_examples=ON --build_benchmarks=ON + source ./dev/build_arrow.sh + install_arrow_deps + ./dev/builddeps-veloxbe.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_tests=ON \ + --build_examples=ON --build_benchmarks=ON --build_protobuf=ON + - name: Gluten CPP Test + run: | + cd ./cpp/build && \ + ctest -V - uses: actions/upload-artifact@v2 with: name: velox-native-lib-centos-8-${{github.sha}} @@ -534,6 +628,10 @@ jobs: with: name: udf-example-lib-centos-8-${{github.sha}} path: ./cpp/build/velox/udf/examples/ + - uses: actions/upload-artifact@v2 + with: + name: benchmark-centos-8-${{github.sha}} + path: ./cpp/build/velox/benchmarks/ - uses: actions/upload-artifact@v2 with: name: arrow-jars-centos-8-${{github.sha}} @@ -557,11 +655,20 @@ jobs: with: name: udf-example-lib-centos-8-${{github.sha}} path: ./cpp/build/velox/udf/examples/ + - name: Download Benchmark + uses: actions/download-artifact@v2 + with: + name: benchmark-centos-8-${{github.sha}} + path: ./cpp/build/velox/benchmarks/ - name: Download Arrow Jars uses: actions/download-artifact@v2 with: name: arrow-jars-centos-8-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ + - name: Update mirror list + run: | + sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true + sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || true - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y @@ -578,10 +685,6 @@ jobs: working-directory: ${{ github.workspace }} run: | mkdir -p '${{ env.CCACHE_DIR }}' - - name: Gluten CPP Test - run: | - cd $GITHUB_WORKSPACE/cpp/build && \ - ctest -V - name: Prepare spark.test.home for Spark 3.2.2 (other tests) run: | cd $GITHUB_WORKSPACE/ && \ @@ -604,19 +707,20 @@ jobs: run: | cd $GITHUB_WORKSPACE/ export SPARK_SCALA_VERSION=2.12 - mvn -ntp clean install -Pspark-3.2 -Pspark-ut -Pbackends-velox -Pceleborn -Piceberg -Pdelta -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark32/spark_home/" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags && \ - mvn -ntp test -Pspark-3.2 -Pbackends-velox -Piceberg -Pdelta -DtagsToExclude=None -DtagsToInclude=org.apache.gluten.tags.UDFTest + $MVN_CMD clean install -Pspark-3.2 -Pspark-ut -Pbackends-velox -Pceleborn -Piceberg -Pdelta -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark32/spark_home/" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags && \ + $MVN_CMD test -Pspark-3.2 -Pbackends-velox -Piceberg -Pdelta -DtagsToExclude=None -DtagsToInclude=org.apache.gluten.tags.UDFTest - name: Upload golden files if: failure() uses: actions/upload-artifact@v4 with: name: golden-files-spark32 path: 
/tmp/tpch-approved-plan/** - # - name: Gluten CPP Benchmark Test - # run: | - # # This test depends on example.json generated by the above mvn test. - # cd $GITHUB_WORKSPACE/cpp/build/velox/benchmarks && \ - # ./generic_benchmark --run-example --with-shuffle --threads 1 --iterations 1 + - name: Gluten CPP Benchmark Test + run: | + # This test depends on example.json generated by the above mvn test. + cd $GITHUB_WORKSPACE/cpp/build/velox/benchmarks && \ + sudo chmod +x ./generic_benchmark && \ + ./generic_benchmark --run-example --with-shuffle --threads 1 --iterations 1 run-spark-test-spark32-slow: needs: build-native-lib-centos-8 @@ -636,6 +740,10 @@ jobs: with: name: arrow-jars-centos-8-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ + - name: Update mirror list + run: | + sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true + sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || true - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y @@ -662,7 +770,7 @@ jobs: - name: Build and run unit test for Spark 3.2.2 (slow tests) run: | cd $GITHUB_WORKSPACE/ - mvn -ntp clean install -Pspark-3.2 -Pspark-ut -Pbackends-velox -Pceleborn -Piceberg -Pdelta -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark32/spark_home/" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest + $MVN_CMD clean install -Pspark-3.2 -Pspark-ut -Pbackends-velox -Pceleborn -Piceberg -Pdelta -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark32/spark_home/" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest run-spark-test-spark33: needs: build-native-lib-centos-8 @@ -687,6 +795,10 @@ jobs: with: name: arrow-jars-centos-8-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ + - name: Update mirror list + run: | + sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true + sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || true - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y @@ -725,8 +837,8 @@ jobs: run: | cd $GITHUB_WORKSPACE/ export SPARK_SCALA_VERSION=2.12 - mvn -ntp clean install -Pspark-3.3 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark33/spark_home/" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags && \ - mvn -ntp test -Pspark-3.3 -Pbackends-velox -Piceberg -Pdelta -DtagsToExclude=None -DtagsToInclude=org.apache.gluten.tags.UDFTest + $MVN_CMD clean install -Pspark-3.3 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark33/spark_home/" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags && \ + $MVN_CMD test -Pspark-3.3 -Pbackends-velox -Piceberg -Pdelta -DtagsToExclude=None -DtagsToInclude=org.apache.gluten.tags.UDFTest - name: Upload golden files if: failure() uses: actions/upload-artifact@v4 @@ -753,6 +865,10 @@ jobs: with: name: arrow-jars-centos-8-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ + - name: Update mirror list + run: | + sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true + sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || true - name: Setup build dependency run: 
| yum install sudo patch java-1.8.0-openjdk-devel wget -y @@ -779,7 +895,7 @@ jobs: - name: Build and Run unit test for Spark 3.3.1 (slow tests) run: | cd $GITHUB_WORKSPACE/ - mvn -ntp clean install -Pspark-3.3 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark33/spark_home/" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest + $MVN_CMD clean install -Pspark-3.3 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark33/spark_home/" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest run-spark-test-spark34: needs: build-native-lib-centos-8 @@ -804,6 +920,10 @@ jobs: with: name: arrow-jars-centos-8-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ + - name: Update mirror list + run: | + sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true + sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || true - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y @@ -842,8 +962,8 @@ jobs: run: | cd $GITHUB_WORKSPACE/ export SPARK_SCALA_VERSION=2.12 - mvn -ntp clean install -Pspark-3.4 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark34/spark_home/" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags && \ - mvn -ntp test -Pspark-3.4 -Pbackends-velox -Piceberg -Pdelta -DtagsToExclude=None -DtagsToInclude=org.apache.gluten.tags.UDFTest + $MVN_CMD clean install -Pspark-3.4 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark34/spark_home/" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags && \ + $MVN_CMD test -Pspark-3.4 -Pbackends-velox -Piceberg -Pdelta -DtagsToExclude=None -DtagsToInclude=org.apache.gluten.tags.UDFTest - name: Upload golden files if: failure() uses: actions/upload-artifact@v4 @@ -870,6 +990,10 @@ jobs: with: name: arrow-jars-centos-8-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ + - name: Update mirror list + run: | + sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true + sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || true - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y @@ -896,7 +1020,7 @@ jobs: - name: Build and Run unit test for Spark 3.4.2 (slow tests) run: | cd $GITHUB_WORKSPACE/ - mvn -ntp clean install -Pspark-3.4 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark34/spark_home/" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest + $MVN_CMD clean install -Pspark-3.4 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark34/spark_home/" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest run-spark-test-spark35: needs: build-native-lib-centos-8 @@ -921,6 +1045,10 @@ jobs: with: name: arrow-jars-centos-8-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ + - name: Update mirror list + run: | + sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true + sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || 
true - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y @@ -959,8 +1087,8 @@ jobs: run: | cd $GITHUB_WORKSPACE/ export SPARK_SCALA_VERSION=2.12 - mvn -ntp clean install -Pspark-3.5 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark35/spark_home/" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags && \ - mvn -ntp test -Pspark-3.5 -Pbackends-velox -Piceberg -Pdelta -DtagsToExclude=None -DtagsToInclude=org.apache.gluten.tags.UDFTest + $MVN_CMD clean install -Pspark-3.5 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark35/spark_home/" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags && \ + $MVN_CMD test -Pspark-3.5 -Pbackends-velox -Piceberg -Pdelta -DtagsToExclude=None -DtagsToInclude=org.apache.gluten.tags.UDFTest - name: Upload golden files if: failure() uses: actions/upload-artifact@v4 @@ -986,6 +1114,10 @@ jobs: with: name: arrow-jars-centos-8-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ + - name: Update mirror list + run: | + sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true + sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || true - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y @@ -1018,4 +1150,4 @@ jobs: - name: Build and Run unit test for Spark 3.5.1 (slow tests) run: | cd $GITHUB_WORKSPACE/ - mvn -ntp clean install -Pspark-3.5 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark35/spark_home/" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest \ No newline at end of file + $MVN_CMD clean install -Pspark-3.5 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark35/spark_home/" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest diff --git a/.github/workflows/velox_docker_cache.yml b/.github/workflows/velox_docker_cache.yml index 591be26093eea..3c05acf7eca21 100644 --- a/.github/workflows/velox_docker_cache.yml +++ b/.github/workflows/velox_docker_cache.yml @@ -27,7 +27,7 @@ concurrency: jobs: cache-native-lib: runs-on: ubuntu-20.04 - container: apache/gluten:gluten-vcpkg-builder_2024_05_22 # centos7 with dependencies installed + container: apache/gluten:gluten-vcpkg-builder_2024_05_29 # centos7 with dependencies installed steps: - uses: actions/checkout@v2 - name: Generate cache key @@ -38,7 +38,9 @@ jobs: uses: actions/cache/restore@v3 with: lookup-only: true - path: ./cpp/build/releases/ + path: | + ./cpp/build/releases/ + /root/.m2/repository/org/apache/arrow/ key: cache-velox-build-${{ hashFiles('./cache-key') }} - name: Build Gluten Velox third party if: steps.check-cache.outputs.cache-hit != 'true' @@ -49,7 +51,9 @@ jobs: id: cache uses: actions/cache/save@v3 with: - path: ./cpp/build/releases/ + path: | + ./cpp/build/releases/ + /root/.m2/repository/org/apache/arrow/ key: cache-velox-build-${{ hashFiles('./cache-key') }} # ccache-native-lib-ubuntu-velox-ut: diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000000000..214ea5a87954a --- /dev/null +++ b/NOTICE @@ -0,0 +1,26 @@ +Apache Gluten(incubating) +Copyright 2023-2024 The Apache Software Foundation + +This product includes software 
developed at +The Apache Software Foundation (http://www.apache.org/). + +Apache Spark +Copyright 2014 and onwards The Apache Software Foundation. + +Apache Celeborn +Copyright 2022-2024 The Apache Software Foundation. + +Apache Uniffle (incubating) +Copyright 2022 and onwards The Apache Software Foundation. + +Apache Iceberg +Copyright 2017-2024 The Apache Software Foundation. + +Apache Parquet MR +Copyright 2014-2024 The Apache Software Foundation. + +Apache ORC +Copyright 2013 and onwards The Apache Software Foundation. + +Apache Thrift +Copyright (C) 2006 - 2019, The Apache Software Foundation. diff --git a/backends-clickhouse/pom.xml b/backends-clickhouse/pom.xml index 94df2f36f183d..5672056b41607 100644 --- a/backends-clickhouse/pom.xml +++ b/backends-clickhouse/pom.xml @@ -100,7 +100,7 @@ org.scalacheck scalacheck_${scala.binary.version} - 1.13.5 + 1.17.0 test @@ -126,13 +126,13 @@ org.scalatestplus - scalatestplus-mockito_2.12 + scalatestplus-mockito_${scala.binary.version} 1.0.0-M2 test org.scalatestplus - scalatestplus-scalacheck_2.12 + scalatestplus-scalacheck_${scala.binary.version} 3.1.0.0-RC2 test @@ -215,6 +215,38 @@ 8.5.9 test + + org.apache.arrow + arrow-memory-core + ${arrow.version} + provided + + + io.netty + netty-common + + + io.netty + netty-buffer + + + + + org.apache.arrow + arrow-vector + ${arrow.version} + provided + + + io.netty + netty-common + + + io.netty + netty-buffer + + + @@ -270,6 +302,13 @@ src/main/delta-${delta.binary.version}/**/*.scala src/test/delta-${delta.binary.version}/**/*.scala + + src/main/delta-${delta.binary.version}/org/apache/spark/sql/delta/commands/*.scala + src/main/delta-${delta.binary.version}/org/apache/spark/sql/delta/commands/merge/*.scala + src/main/delta-${delta.binary.version}/org/apache/spark/sql/delta/files/*.scala + src/main/delta-${delta.binary.version}/org/apache/spark/sql/delta/DeltaLog.scala + src/main/delta-${delta.binary.version}/org/apache/spark/sql/delta/Snapshot.scala + diff --git a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala new file mode 100644 index 0000000000000..0794b45158e64 --- /dev/null +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.delta + +import org.apache.gluten.backendsapi.clickhouse.CHBackendSettings +import org.apache.gluten.execution.ColumnarToRowExecBase + +import org.apache.spark.SparkException +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.delta.actions._ +import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 +import org.apache.spark.sql.delta.constraints.{Constraint, Constraints} +import org.apache.spark.sql.delta.files.MergeTreeCommitProtocol +import org.apache.spark.sql.delta.schema.InvariantViolationException +import org.apache.spark.sql.delta.sources.DeltaSQLConf +import org.apache.spark.sql.execution.{SparkPlan, SQLExecution} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec +import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FakeRowAdaptor, FileFormatWriter, WriteJobStatsTracker} +import org.apache.spark.sql.execution.datasources.v1.clickhouse.MergeTreeFileFormatWriter +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig +import org.apache.spark.util.{Clock, SerializableConfiguration} + +import org.apache.commons.lang3.exception.ExceptionUtils + +import scala.collection.mutable.ListBuffer + +object ClickhouseOptimisticTransaction {} +class ClickhouseOptimisticTransaction( + override val deltaLog: DeltaLog, + override val snapshot: Snapshot)(implicit override val clock: Clock) + extends OptimisticTransaction(deltaLog, snapshot) { + + def this(deltaLog: DeltaLog, snapshotOpt: Option[Snapshot] = None)(implicit clock: Clock) { + this( + deltaLog, + snapshotOpt.getOrElse(deltaLog.update()) + ) + } + + def insertFakeRowAdaptor(queryPlan: SparkPlan): SparkPlan = queryPlan match { + // if the child is columnar, we can just wrap&transfer the columnar data + case c2r: ColumnarToRowExecBase => + FakeRowAdaptor(c2r.child) + // If the child is aqe, we make aqe "support columnar", + // then aqe itself will guarantee to generate columnar outputs. 
+ // So FakeRowAdaptor will always consumes columnar data, + // thus avoiding the case of c2r->aqe->r2c->writer + case aqe: AdaptiveSparkPlanExec => + FakeRowAdaptor( + AdaptiveSparkPlanExec( + aqe.inputPlan, + aqe.context, + aqe.preprocessingRules, + aqe.isSubquery, + supportsColumnar = true + )) + case other => FakeRowAdaptor(other) + } + + override def writeFiles( + inputData: Dataset[_], + writeOptions: Option[DeltaOptions], + additionalConstraints: Seq[Constraint]): Seq[FileAction] = { + if (ClickHouseConfig.isMergeTreeFormatEngine(metadata.configuration)) { + hasWritten = true + + val spark = inputData.sparkSession + val (data, partitionSchema) = performCDCPartition(inputData) + val outputPath = deltaLog.dataPath + + val (queryExecution, output, generatedColumnConstraints, _) = + normalizeData(deltaLog, data) + val partitioningColumns = getPartitioningColumns(partitionSchema, output) + + val committer = new MergeTreeCommitProtocol("delta-mergetree", outputPath.toString, None) + + // val (optionalStatsTracker, _) = + // getOptionalStatsTrackerAndStatsCollection(output, outputPath, partitionSchema, data) + val (optionalStatsTracker, _) = (None, None) + + val constraints = + Constraints.getAll(metadata, spark) ++ generatedColumnConstraints ++ additionalConstraints + + SQLExecution.withNewExecutionId(queryExecution, Option("deltaTransactionalWrite")) { + val outputSpec = FileFormatWriter.OutputSpec(outputPath.toString, Map.empty, output) + + val queryPlan = queryExecution.executedPlan + val newQueryPlan = insertFakeRowAdaptor(queryPlan) + + val statsTrackers: ListBuffer[WriteJobStatsTracker] = ListBuffer() + + if (spark.conf.get(DeltaSQLConf.DELTA_HISTORY_METRICS_ENABLED)) { + val basicWriteJobStatsTracker = new BasicWriteJobStatsTracker( + new SerializableConfiguration(deltaLog.newDeltaHadoopConf()), + BasicWriteJobStatsTracker.metrics) + // registerSQLMetrics(spark, basicWriteJobStatsTracker.driverSideMetrics) + statsTrackers.append(basicWriteJobStatsTracker) + } + + // Retain only a minimal selection of Spark writer options to avoid any potential + // compatibility issues + var options = writeOptions match { + case None => Map.empty[String, String] + case Some(writeOptions) => + writeOptions.options.filterKeys { + key => + key.equalsIgnoreCase(DeltaOptions.MAX_RECORDS_PER_FILE) || + key.equalsIgnoreCase(DeltaOptions.COMPRESSION) + }.toMap + } + + spark.conf.getAll.foreach( + entry => { + if ( + entry._1.startsWith(s"${CHBackendSettings.getBackendConfigPrefix}.runtime_settings") + || entry._1.equalsIgnoreCase(DeltaSQLConf.DELTA_OPTIMIZE_MIN_FILE_SIZE.key) + ) { + options += (entry._1 -> entry._2) + } + }) + + try { + val tableV2 = ClickHouseTableV2.getTable(deltaLog) + MergeTreeFileFormatWriter.write( + sparkSession = spark, + plan = newQueryPlan, + fileFormat = tableV2.getFileFormat(metadata), + // formats. 
+ committer = committer, + outputSpec = outputSpec, + // scalastyle:off deltahadoopconfiguration + hadoopConf = spark.sessionState + .newHadoopConfWithOptions(metadata.configuration ++ deltaLog.options), + // scalastyle:on deltahadoopconfiguration + orderByKeyOption = tableV2.orderByKeyOption, + lowCardKeyOption = tableV2.lowCardKeyOption, + minmaxIndexKeyOption = tableV2.minmaxIndexKeyOption, + bfIndexKeyOption = tableV2.bfIndexKeyOption, + setIndexKeyOption = tableV2.setIndexKeyOption, + primaryKeyOption = tableV2.primaryKeyOption, + partitionColumns = partitioningColumns, + bucketSpec = tableV2.bucketOption, + statsTrackers = optionalStatsTracker.toSeq ++ statsTrackers, + options = options, + constraints = constraints + ) + } catch { + case s: SparkException => + // Pull an InvariantViolationException up to the top level if it was the root cause. + val violationException = ExceptionUtils.getRootCause(s) + if (violationException.isInstanceOf[InvariantViolationException]) { + throw violationException + } else { + throw s + } + } + } + committer.addedStatuses.toSeq ++ committer.changeFiles + } else { + // TODO: support native delta parquet write + // 1. insert FakeRowAdaptor + // 2. DeltaInvariantCheckerExec transform + // 3. DeltaTaskStatisticsTracker collect null count / min values / max values + // 4. set the parameters 'staticPartitionWriteOnly', 'isNativeAppliable', + // 'nativeFormat' in the LocalProperty of the sparkcontext + super.writeFiles(inputData, writeOptions, additionalConstraints) + } + } +} diff --git a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/execution/datasources/v2/clickhouse/DeltaLogAdapter.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/DeltaAdapter.scala similarity index 78% rename from backends-clickhouse/src/main/delta-22/org/apache/spark/sql/execution/datasources/v2/clickhouse/DeltaLogAdapter.scala rename to backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/DeltaAdapter.scala index dd6ad383e0d7d..b6d4c04844c43 100644 --- a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/execution/datasources/v2/clickhouse/DeltaLogAdapter.scala +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/DeltaAdapter.scala @@ -14,10 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.sql.execution.datasources.v2.clickhouse +package org.apache.spark.sql.delta -import org.apache.spark.sql.delta.{DeltaLog, Snapshot} - -object DeltaLogAdapter { - def snapshot(deltaLog: DeltaLog): Snapshot = deltaLog.unsafeVolatileSnapshot +object DeltaAdapter extends DeltaAdapterTrait { + override def snapshot(deltaLog: DeltaLog): Snapshot = deltaLog.snapshot } diff --git a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/DeltaLog.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/DeltaLog.scala index 0f6455997e568..57c6c8550fc07 100644 --- a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/DeltaLog.scala +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/DeltaLog.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.delta.metering.DeltaLogging import org.apache.spark.sql.delta.schema.{SchemaMergingUtils, SchemaUtils} import org.apache.spark.sql.delta.sources.DeltaSQLConf import org.apache.spark.sql.delta.storage.LogStoreProvider +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation} import org.apache.spark.sql.types.{StructField, StructType} @@ -214,7 +215,9 @@ class DeltaLog private ( */ def startTransaction(): OptimisticTransaction = { update() + // --- modified start new ClickhouseOptimisticTransaction(this, None) + // --- modified end } /** @@ -443,7 +446,13 @@ class DeltaLog private ( val fileIndex = TahoeLogFileIndex(spark, this, dataPath, snapshotToUse, partitionFilters, isTimeTravelQuery) - val bucketSpec: Option[BucketSpec] = ClickHouseTableV2.getTable(this).bucketOption + // --- modified start + val bucketSpec: Option[BucketSpec] = + if (ClickHouseConfig.isMergeTreeFormatEngine(snapshotToUse.metadata.configuration)) { + ClickHouseTableV2.getTable(this).bucketOption + } else { + None + } new DeltaHadoopFsRelation( fileIndex, partitionSchema = @@ -464,20 +473,28 @@ class DeltaLog private ( spark, this ) + // --- modified end } - override def fileFormat(metadata: Metadata = metadata): FileFormat = - ClickHouseTableV2.getTable(this).getFileFormat(metadata) - + override def fileFormat(metadata: Metadata = metadata): FileFormat = { + // --- modified start + if (ClickHouseConfig.isMergeTreeFormatEngine(metadata.configuration)) { + ClickHouseTableV2.getTable(this).getFileFormat(metadata) + } else { + super.fileFormat(metadata) + } + // --- modified end + } } object DeltaLog extends DeltaLogging { + // --- modified start @SuppressWarnings(Array("io.github.zhztheplayer.scalawarts.InheritFromCaseClass")) private class DeltaHadoopFsRelation( location: FileIndex, partitionSchema: StructType, - // The top-level columns in `dataSchema` should match the actual physical file schema, otherwise - // the ORC data source may not work with the by-ordinal mode. + // The top-level columns in `dataSchema` should match the actual physical file schema, + // otherwise the ORC data source may not work with the by-ordinal mode. dataSchema: StructType, bucketSpec: Option[BucketSpec], fileFormat: FileFormat, @@ -502,6 +519,7 @@ object DeltaLog extends DeltaLogging { ).run(sparkSession) } } + // --- modified end /** * The key type of `DeltaLog` cache. 
It's a pair of the canonicalized table path and the file diff --git a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/Snapshot.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/Snapshot.scala index 2233aa0cd1efc..2e4d6bb2207c6 100644 --- a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/Snapshot.scala +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/Snapshot.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.delta.sources.DeltaSQLConf import org.apache.spark.sql.delta.stats.{DataSkippingReader, DeltaScan, FileSizeHistogram, StatisticsCollection} import org.apache.spark.sql.delta.util.StateCache import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StructType import org.apache.spark.util.{SerializableConfiguration, Utils} @@ -404,6 +405,7 @@ class Snapshot( s"${getClass.getSimpleName}(path=$path, version=$version, metadata=$metadata, " + s"logSegment=$logSegment, checksumOpt=$checksumOpt)" + // --- modified start override def filesForScan( projection: Seq[Attribute], filters: Seq[Expression], @@ -418,31 +420,36 @@ class Snapshot( } private def replaceWithAddMergeTreeParts(deltaScan: DeltaScan) = { - DeltaScan.apply( - deltaScan.version, - deltaScan.files - .map( - addFile => { - val addFileAsKey = AddFileAsKey(addFile) - - val ret = ClickhouseSnapshot.addFileToAddMTPCache.get(addFileAsKey) - // this is for later use - ClickhouseSnapshot.pathToAddMTPCache.put(ret.fullPartPath(), ret) - ret - }), - deltaScan.total, - deltaScan.partition, - deltaScan.scanned - )( - deltaScan.scannedSnapshot, - deltaScan.partitionFilters, - deltaScan.dataFilters, - deltaScan.unusedFilters, - deltaScan.projection, - deltaScan.scanDurationMs, - deltaScan.dataSkippingType - ) + if (ClickHouseConfig.isMergeTreeFormatEngine(metadata.configuration)) { + DeltaScan.apply( + deltaScan.version, + deltaScan.files + .map( + addFile => { + val addFileAsKey = AddFileAsKey(addFile) + + val ret = ClickhouseSnapshot.addFileToAddMTPCache.get(addFileAsKey) + // this is for later use + ClickhouseSnapshot.pathToAddMTPCache.put(ret.fullPartPath(), ret) + ret + }), + deltaScan.total, + deltaScan.partition, + deltaScan.scanned + )( + deltaScan.scannedSnapshot, + deltaScan.partitionFilters, + deltaScan.dataFilters, + deltaScan.unusedFilters, + deltaScan.projection, + deltaScan.scanDurationMs, + deltaScan.dataSkippingType + ) + } else { + deltaScan + } } + // --- modified end logInfo(s"Created snapshot $this") init() diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/catalog/ClickHouseTableV2.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/catalog/ClickHouseTableV2.scala similarity index 54% rename from backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/catalog/ClickHouseTableV2.scala rename to backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/catalog/ClickHouseTableV2.scala index 1107c6a2ef76d..90370f0b1d998 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/catalog/ClickHouseTableV2.scala +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/catalog/ClickHouseTableV2.scala @@ -17,18 +17,16 @@ package org.apache.spark.sql.delta.catalog import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession -import 
org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable} +import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} -import org.apache.spark.sql.connector.catalog.TableCatalog import org.apache.spark.sql.connector.read.InputPartition import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} -import org.apache.spark.sql.delta.{ClickhouseSnapshot, DeltaErrors, DeltaLog, DeltaTimeTravelSpec} +import org.apache.spark.sql.delta.{ClickhouseSnapshot, DeltaErrors, DeltaLog, DeltaTimeTravelSpec, Snapshot} import org.apache.spark.sql.delta.actions.Metadata import org.apache.spark.sql.delta.catalog.ClickHouseTableV2.deltaLog2Table import org.apache.spark.sql.delta.sources.DeltaDataSource import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, PartitionDirectory} import org.apache.spark.sql.execution.datasources.utils.MergeTreePartsPartitionsUtil -import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig import org.apache.spark.sql.execution.datasources.v2.clickhouse.source.DeltaMergeTreeFileFormat import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.collection.BitSet @@ -56,8 +54,8 @@ class ClickHouseTableV2( tableIdentifier, timeTravelOpt, options, - cdcOptions) { - protected def getMetadata: Metadata = if (snapshot == null) Metadata() else snapshot.metadata + cdcOptions) + with ClickHouseTableV2Base { lazy val (rootPath, partitionFilters, timeTravelByPath) = { if (catalogTable.isDefined) { @@ -83,7 +81,6 @@ class ClickHouseTableV2( override def properties(): ju.Map[String, String] = { val ret = super.properties() - ret.put(TableCatalog.PROP_PROVIDER, ClickHouseConfig.NAME) // for file path based write if (snapshot.version < 0 && clickhouseExtensionOptions.nonEmpty) { @@ -96,126 +93,6 @@ class ClickHouseTableV2( new WriteIntoDeltaBuilder(deltaLog, info.options) } - lazy val dataBaseName = catalogTable - .map(_.identifier.database.getOrElse("default")) - .getOrElse("clickhouse") - - lazy val tableName = catalogTable - .map(_.identifier.table) - .getOrElse(path.toUri.getPath) - - lazy val bucketOption: Option[BucketSpec] = { - val tableProperties = properties() - if (tableProperties.containsKey("numBuckets")) { - val numBuckets = tableProperties.get("numBuckets").trim.toInt - val bucketColumnNames: Seq[String] = - tableProperties.get("bucketColumnNames").split(",").map(_.trim).toSeq - val sortColumnNames: Seq[String] = if (tableProperties.containsKey("orderByKey")) { - tableProperties.get("orderByKey").split(",").map(_.trim).toSeq - } else Seq.empty[String] - Some(BucketSpec(numBuckets, bucketColumnNames, sortColumnNames)) - } else { - None - } - } - - lazy val lowCardKeyOption: Option[Seq[String]] = { - getCommaSeparatedColumns("lowCardKey") - } - - lazy val minmaxIndexKeyOption: Option[Seq[String]] = { - getCommaSeparatedColumns("minmaxIndexKey") - } - - lazy val bfIndexKeyOption: Option[Seq[String]] = { - getCommaSeparatedColumns("bloomfilterIndexKey") - } - - lazy val setIndexKeyOption: Option[Seq[String]] = { - getCommaSeparatedColumns("setIndexKey") - } - - private def getCommaSeparatedColumns(keyName: String) = { - val tableProperties = properties() - if (tableProperties.containsKey(keyName)) { - if (tableProperties.get(keyName).nonEmpty) { - val keys = tableProperties.get(keyName).split(",").map(_.trim).toSeq - keys.foreach( - s => { - if (s.contains(".")) { - throw new IllegalStateException( - s"$keyName $s can not contain 
'.' (not support nested column yet)") - } - }) - Some(keys.map(s => s.toLowerCase())) - } else { - None - } - } else { - None - } - } - - lazy val orderByKeyOption: Option[Seq[String]] = { - if (bucketOption.isDefined && bucketOption.get.sortColumnNames.nonEmpty) { - val orderByKes = bucketOption.get.sortColumnNames - val invalidKeys = orderByKes.intersect(partitionColumns) - if (invalidKeys.nonEmpty) { - throw new IllegalStateException( - s"partition cols $invalidKeys can not be in the order by keys.") - } - Some(orderByKes) - } else { - val tableProperties = properties() - if (tableProperties.containsKey("orderByKey")) { - if (tableProperties.get("orderByKey").nonEmpty) { - val orderByKes = tableProperties.get("orderByKey").split(",").map(_.trim).toSeq - val invalidKeys = orderByKes.intersect(partitionColumns) - if (invalidKeys.nonEmpty) { - throw new IllegalStateException( - s"partition cols $invalidKeys can not be in the order by keys.") - } - Some(orderByKes) - } else { - None - } - } else { - None - } - } - } - - lazy val primaryKeyOption: Option[Seq[String]] = { - if (orderByKeyOption.isDefined) { - val tableProperties = properties() - if (tableProperties.containsKey("primaryKey")) { - if (tableProperties.get("primaryKey").nonEmpty) { - val primaryKeys = tableProperties.get("primaryKey").split(",").map(_.trim).toSeq - if (!orderByKeyOption.get.mkString(",").startsWith(primaryKeys.mkString(","))) { - throw new IllegalStateException( - s"Primary key $primaryKeys must be a prefix of the sorting key") - } - Some(primaryKeys) - } else { - None - } - } else { - None - } - } else { - None - } - } - - lazy val partitionColumns = snapshot.metadata.partitionColumns - - lazy val clickhouseTableConfigs: Map[String, String] = { - val tableProperties = properties() - val configs = scala.collection.mutable.Map[String, String]() - configs += ("storage_policy" -> tableProperties.getOrDefault("storage_policy", "default")) - configs.toMap - } - def getFileFormat(meta: Metadata): DeltaMergeTreeFileFormat = { new DeltaMergeTreeFileFormat( meta, @@ -232,41 +109,20 @@ class ClickHouseTableV2( partitionColumns ) } - def cacheThis(): Unit = { - deltaLog2Table.put(deltaLog, this) - } - cacheThis() + override def deltaProperties(): ju.Map[String, String] = properties() - def primaryKey(): String = primaryKeyOption match { - case Some(keys) => keys.mkString(",") - case None => "" - } + override def deltaCatalog(): Option[CatalogTable] = catalogTable - def orderByKey(): String = orderByKeyOption match { - case Some(keys) => keys.mkString(",") - case None => "tuple()" - } - - def lowCardKey(): String = lowCardKeyOption match { - case Some(keys) => keys.mkString(",") - case None => "" - } + override def deltaPath(): Path = path - def minmaxIndexKey(): String = minmaxIndexKeyOption match { - case Some(keys) => keys.mkString(",") - case None => "" - } + override def deltaSnapshot(): Snapshot = snapshot - def bfIndexKey(): String = bfIndexKeyOption match { - case Some(keys) => keys.mkString(",") - case None => "" + def cacheThis(): Unit = { + deltaLog2Table.put(deltaLog, this) } - def setIndexKey(): String = setIndexKeyOption match { - case Some(keys) => keys.mkString(",") - case None => "" - } + cacheThis() } @SuppressWarnings(Array("io.github.zhztheplayer.scalawarts.InheritFromCaseClass")) diff --git a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/DeleteCommand.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/DeleteCommand.scala index 
527b9619eb5d3..61b0330723f53 100644 --- a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/DeleteCommand.scala +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/DeleteCommand.scala @@ -216,6 +216,7 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt if (candidateFiles.isEmpty) { Array.empty[String] } else { + // --- modified start data .filter(new Column(cond)) .select(input_file_name().as("input_files")) @@ -224,6 +225,7 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt .distinct() .as[String] .collect() + // --- modified end } } diff --git a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/MergeIntoCommand.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/MergeIntoCommand.scala index 89208dd453146..4b26d5d5d9498 100644 --- a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/MergeIntoCommand.scala +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/MergeIntoCommand.scala @@ -407,7 +407,9 @@ case class MergeIntoCommand( val recordTouchedFileName = udf { (fileName: String) => { + // --- modified start fileName.split(",").foreach(name => touchedFilesAccum.add(name)) + // --- modified end 1 } }.asNondeterministic() diff --git a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala index 571ac63f1eb60..6437ab75903b6 100644 --- a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.delta.schema.SchemaUtils import org.apache.spark.sql.delta.skipping.MultiDimClustering import org.apache.spark.sql.delta.sources.DeltaSQLConf import org.apache.spark.sql.execution.command.{LeafRunnableCommand, RunnableCommand} +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig import org.apache.spark.sql.execution.datasources.v2.clickhouse.metadata.AddMergeTreeParts import org.apache.spark.sql.execution.datasources.v2.clickhouse.utils.CHDataSourceUtils import org.apache.spark.sql.execution.metric.SQLMetric @@ -127,8 +128,10 @@ case class OptimizeTableCommand( override val otherCopyArgs: Seq[AnyRef] = zOrderBy :: Nil override def run(sparkSession: SparkSession): Seq[Row] = { + // --- modified start CHDataSourceUtils.ensureClickHouseTableV2(tableId, sparkSession) val deltaLog = getDeltaLogClickhouse(sparkSession, path, tableId, "OPTIMIZE") + // --- modified end val partitionColumns = deltaLog.snapshot.metadata.partitionColumns // Parse the predicate expression into Catalyst expression and verify only simple filters @@ -177,6 +180,10 @@ class OptimizeExecutor( def optimize(): Seq[Row] = { recordDeltaOperation(deltaLog, "delta.optimize") { + // --- modified start + val isMergeTreeFormat = ClickHouseConfig + .isMergeTreeFormatEngine(deltaLog.snapshot.metadata.configuration) + // --- modified end val minFileSize = sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MIN_FILE_SIZE) val maxFileSize = @@ -194,37 +201,59 @@ class OptimizeExecutor( // select all files in case of multi-dimensional clustering val filesToProcess = candidateFiles.filter(_.size < minFileSize || isMultiDimClustering) - val 
partitionsToCompact = filesToProcess - .groupBy(file => (file.asInstanceOf[AddMergeTreeParts].bucketNum, file.partitionValues)) - .toSeq - - val jobs = groupFilesIntoBinsClickhouse(partitionsToCompact, maxFileSize) - - val parallelJobCollection = new ParVector(jobs.toVector) + // --- modified start // Create a task pool to parallelize the submission of optimization jobs to Spark. val threadPool = ThreadUtils.newForkJoinPool( "OptimizeJob", sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MAX_THREADS)) + val (updates, jobs) = if (isMergeTreeFormat) { + val partitionsToCompact = filesToProcess + .groupBy(file => (file.asInstanceOf[AddMergeTreeParts].bucketNum, file.partitionValues)) + .toSeq + + val jobs = groupFilesIntoBinsClickhouse(partitionsToCompact, maxFileSize) + + val parallelJobCollection = new ParVector(jobs.toVector) + + val updates = + try { + val forkJoinPoolTaskSupport = new ForkJoinTaskSupport(threadPool) + parallelJobCollection.tasksupport = forkJoinPoolTaskSupport + + parallelJobCollection + .flatMap( + partitionBinGroup => + runOptimizeBinJobClickhouse( + txn, + partitionBinGroup._1._2, + partitionBinGroup._1._1, + partitionBinGroup._2, + maxFileSize)) + .seq + } finally { + threadPool.shutdownNow() + } + (updates, jobs) + } else { + val partitionsToCompact = filesToProcess.groupBy(_.partitionValues).toSeq + + val jobs = groupFilesIntoBins(partitionsToCompact, maxFileSize) - val updates = - try { + val parallelJobCollection = new ParVector(jobs.toVector) + + val updates = try { val forkJoinPoolTaskSupport = new ForkJoinTaskSupport(threadPool) parallelJobCollection.tasksupport = forkJoinPoolTaskSupport - parallelJobCollection - .flatMap( - partitionBinGroup => - runOptimizeBinJobClickhouse( - txn, - partitionBinGroup._1._2, - partitionBinGroup._1._1, - partitionBinGroup._2, - maxFileSize)) - .seq + parallelJobCollection.flatMap(partitionBinGroup => + runOptimizeBinJob(txn, partitionBinGroup._1, partitionBinGroup._2, maxFileSize)).seq } finally { threadPool.shutdownNow() } + (updates, jobs) + } + // --- modified end val addedFiles = updates.collect { case a: AddFile => a } val removedFiles = updates.collect { case r: RemoveFile => r } diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/commands/OptimizeTableCommandOverwrites.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/OptimizeTableCommandOverwrites.scala similarity index 95% rename from backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/commands/OptimizeTableCommandOverwrites.scala rename to backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/OptimizeTableCommandOverwrites.scala index 7af1abe437221..f7a180b6a239c 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/commands/OptimizeTableCommandOverwrites.scala +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/OptimizeTableCommandOverwrites.scala @@ -25,6 +25,7 @@ import org.apache.spark.internal.io.SparkHadoopWriterUtils import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} +import org.apache.spark.sql.catalyst.catalog.CatalogTableType import org.apache.spark.sql.delta._ import org.apache.spark.sql.delta.actions.{AddFile, FileAction} import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 @@ -243,21 +244,6 @@ object OptimizeTableCommandOverwrites extends Logging { } - private def isDeltaTable(spark: 
SparkSession, tableName: TableIdentifier): Boolean = { - val catalog = spark.sessionState.catalog - val tableIsNotTemporaryTable = !catalog.isTempView(tableName) - val tableExists = { - (tableName.database.isEmpty || catalog.databaseExists(tableName.database.get)) && - catalog.tableExists(tableName) - } - tableIsNotTemporaryTable && tableExists && catalog - .getTableMetadata(tableName) - .provider - .get - .toLowerCase() - .equals("clickhouse") - } - def getDeltaLogClickhouse( spark: SparkSession, path: Option[String], @@ -276,7 +262,17 @@ object OptimizeTableCommandOverwrites extends Logging { } else if (CHDataSourceUtils.isClickHouseTable(spark, tableIdentifier.get)) { new Path(metadata.location) } else { - throw DeltaErrors.notADeltaTableException(operationName) + DeltaTableIdentifier(spark, tableIdentifier.get) match { + case Some(id) if id.path.nonEmpty => + new Path(id.path.get) + case Some(id) if id.table.nonEmpty => + new Path(metadata.location) + case _ => + if (metadata.tableType == CatalogTableType.VIEW) { + throw DeltaErrors.viewNotSupported(operationName) + } + throw DeltaErrors.notADeltaTableException(operationName) + } } } else { throw DeltaErrors.missingTableIdentifierException(operationName) diff --git a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/UpdateCommand.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/UpdateCommand.scala index f6e2968b703f1..42a081788eb1d 100644 --- a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/UpdateCommand.scala +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/UpdateCommand.scala @@ -144,6 +144,7 @@ case class UpdateCommand( }.asNondeterministic() val pathsToRewrite = withStatusCode("DELTA", UpdateCommand.FINDING_TOUCHED_FILES_MSG) { + // --- modified start data .filter(new Column(updateCondition)) .filter(updatedRowUdf()) @@ -152,6 +153,7 @@ case class UpdateCommand( .distinct() .as[String] .collect() + // --- modified end } scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 diff --git a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/VacuumCommand.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/VacuumCommand.scala index 11f6aa977904c..c5527933b2fcc 100644 --- a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/VacuumCommand.scala +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/commands/VacuumCommand.scala @@ -16,11 +16,14 @@ */ package org.apache.spark.sql.delta.commands +import org.apache.gluten.utils.QueryPlanSelector + import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.delta._ import org.apache.spark.sql.delta.actions.{FileAction, RemoveFile} import org.apache.spark.sql.delta.sources.DeltaSQLConf import org.apache.spark.sql.delta.util.DeltaFileOperations +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig import org.apache.spark.sql.functions.{col, expr, when} import org.apache.spark.util.{Clock, SerializableConfiguration, SystemClock} @@ -41,7 +44,9 @@ import java.util.concurrent.TimeUnit */ object VacuumCommand extends VacuumCommandImpl with Serializable { - case class FileNameAndSize(path: String, length: Long, isDir: Boolean) + // --- modified start + case class FileNameAndSize(path: String, length: Long, isDir: Boolean = false) + // --- modified end /** * Additional check on retention duration to prevent people from shooting themselves in the 
foot. @@ -112,6 +117,11 @@ object VacuumCommand extends VacuumCommandImpl with Serializable { "No state defined for this table. Is this really " + "a Delta table? Refusing to garbage collect.") + // --- modified start + val isMergeTreeFormat = ClickHouseConfig + .isMergeTreeFormatEngine(deltaLog.snapshot.metadata.configuration) + // --- modified end + val retentionMillis = retentionHours.map(h => TimeUnit.HOURS.toMillis(math.round(h))) checkRetentionPeriodSafety(spark, retentionMillis, deltaLog.tombstoneRetentionMillis) @@ -133,6 +143,13 @@ object VacuumCommand extends VacuumCommandImpl with Serializable { val relativizeIgnoreError = spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_RELATIVIZE_IGNORE_ERROR) + // --- modified start + val originalEnabledGluten = + spark.sparkContext.getLocalProperty(QueryPlanSelector.GLUTEN_ENABLE_FOR_THREAD_KEY) + // gluten can not support vacuum command + spark.sparkContext.setLocalProperty(QueryPlanSelector.GLUTEN_ENABLE_FOR_THREAD_KEY, "false") + // --- modified end + val validFiles = snapshot.stateDS .mapPartitions { actions => @@ -209,60 +226,92 @@ object VacuumCommand extends VacuumCommandImpl with Serializable { // 5. We subtract all the valid files and tombstones in our state // 6. We filter all paths with a count of 1, which will correspond to files not in the // state, and empty directories. We can safely delete all of these - val diff_temp = allFilesAndDirs - .where('modificationTime < deleteBeforeTimestamp || 'isDir) - .mapPartitions { - fileStatusIterator => - val reservoirBase = new Path(basePath) - val fs = reservoirBase.getFileSystem(hadoopConf.value.value) - fileStatusIterator.flatMap { - fileStatus => - if (fileStatus.isDir) { - implicit val fileNameAndSizeEncoder = - org.apache.spark.sql.Encoders.product[FileNameAndSize] - Iterator.single( - FileNameAndSize( - relativize(fileStatus.getPath, fs, reservoirBase, isDir = true), - 0, - true) - ) - } else { - val dirs = getAllSubdirs(basePath, fileStatus.path, fs) - val dirsWithSlash = dirs.map { - p => + // --- modified start + val diff = if (isMergeTreeFormat) { + val diff_temp = allFilesAndDirs + .where('modificationTime < deleteBeforeTimestamp || 'isDir) + .mapPartitions { + fileStatusIterator => + val reservoirBase = new Path(basePath) + val fs = reservoirBase.getFileSystem(hadoopConf.value.value) + fileStatusIterator.flatMap { + fileStatus => + if (fileStatus.isDir) { + implicit val fileNameAndSizeEncoder = + org.apache.spark.sql.Encoders.product[FileNameAndSize] + Iterator.single( FileNameAndSize( - relativize(new Path(p), fs, reservoirBase, isDir = true), + relativize(fileStatus.getPath, fs, reservoirBase, isDir = true), 0, true) + ) + } else { + val dirs = getAllSubdirs(basePath, fileStatus.path, fs) + val dirsWithSlash = dirs.map { + p => + FileNameAndSize( + relativize(new Path(p), fs, reservoirBase, isDir = true), + 0, + true) + } + dirsWithSlash ++ Iterator( + FileNameAndSize( + relativize(new Path(fileStatus.path), fs, reservoirBase, isDir = false), + 0, + false)) } - dirsWithSlash ++ Iterator( - FileNameAndSize( - relativize(new Path(fileStatus.path), fs, reservoirBase, isDir = false), - 0, - false)) + } + } + .withColumn( + "dir", + when(col("isDir"), col("path")) + .otherwise(expr("substring_index(path, '/',size(split(path, '/')) -1)"))) + .groupBy(col("path"), col("dir")) + .count() + + diff_temp + .join(validFiles, diff_temp("dir") === validFiles("path"), "leftanti") + .where('count === 1) + .select('path) + .as[String] + .map { + relativePath => + assert( + 
!stringToPath(relativePath).isAbsolute, + "Shouldn't have any absolute paths for deletion here.") + pathToString(DeltaFileOperations.absolutePath(basePath, relativePath)) + } + } else { + allFilesAndDirs + .where('modificationTime < deleteBeforeTimestamp || 'isDir) + .mapPartitions { fileStatusIterator => + val reservoirBase = new Path(basePath) + val fs = reservoirBase.getFileSystem(hadoopConf.value.value) + fileStatusIterator.flatMap { fileStatus => + if (fileStatus.isDir) { + Iterator.single(relativize(fileStatus.getPath, fs, reservoirBase, isDir = true)) + } else { + val dirs = getAllSubdirs(basePath, fileStatus.path, fs) + val dirsWithSlash = dirs.map { p => + relativize(new Path(p), fs, reservoirBase, isDir = true) } + dirsWithSlash ++ Iterator( + relativize(new Path(fileStatus.path), fs, reservoirBase, isDir = false)) + } } - } -// .groupBy(col("path")) - .withColumn( - "dir", - when(col("isDir"), col("path")) - .otherwise(expr("substring_index(path, '/',size(split(path, '/')) -1)"))) - .groupBy(col("path"), col("dir")) - .count() - - val diff = diff_temp - .join(validFiles, diff_temp("dir") === validFiles("path"), "leftanti") - .where('count === 1) - .select('path) - .as[String] - .map { - relativePath => - assert( - !stringToPath(relativePath).isAbsolute, + }.groupBy($"value" as 'path) + .count() + .join(validFiles, Seq("path"), "leftanti") + .where('count === 1) + .select('path) + .as[String] + .map { relativePath => + assert(!stringToPath(relativePath).isAbsolute, "Shouldn't have any absolute paths for deletion here.") pathToString(DeltaFileOperations.absolutePath(basePath, relativePath)) - } + } + } + // --- modified end if (dryRun) { val numFiles = diff.count() @@ -318,6 +367,16 @@ object VacuumCommand extends VacuumCommandImpl with Serializable { spark.createDataset(Seq(basePath)).toDF("path") } finally { allFilesAndDirs.unpersist() + + // --- modified start + if (originalEnabledGluten != null) { + spark.sparkContext.setLocalProperty( + QueryPlanSelector.GLUTEN_ENABLE_FOR_THREAD_KEY, originalEnabledGluten) + } else { + spark.sparkContext.setLocalProperty( + QueryPlanSelector.GLUTEN_ENABLE_FOR_THREAD_KEY, "true") + } + // --- modified end } } } diff --git a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/files/MergeTreeCommitProtocol.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/files/MergeTreeCommitProtocol.scala index bfc2555e82a25..4b6d56644b9e7 100644 --- a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/files/MergeTreeCommitProtocol.scala +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/delta/files/MergeTreeCommitProtocol.scala @@ -212,19 +212,9 @@ class MergeTreeCommitProtocol(jobId: String, path: String, randomPrefixLength: O } override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { - if (addedFiles.nonEmpty) { - /* val fs = new Path(path, addedFiles.head._2).getFileSystem(taskContext.getConfiguration) - val statuses: Seq[FileAction] = addedFiles.map { f => - val filePath = new Path(path, new Path(new URI(f._2))) - val stat = fs.getFileStatus(filePath) - - buildActionFromAddedFile(f, stat, taskContext) - }.toSeq */ - - new TaskCommitMessage(Nil) - } else { - new TaskCommitMessage(Nil) - } + // Return TaskCommitMessage(Nil) directly; + // the FileStatus list will be retrieved from the CH backend. 
+ new TaskCommitMessage(Nil) } override def abortTask(taskContext: TaskAttemptContext): Unit = { diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseDataSource.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseDataSource.scala similarity index 100% rename from backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseDataSource.scala rename to backends-clickhouse/src/main/delta-20/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseDataSource.scala diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseSparkCatalog.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseSparkCatalog.scala similarity index 63% rename from backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseSparkCatalog.scala rename to backends-clickhouse/src/main/delta-20/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseSparkCatalog.scala index c2d208df5ed90..61e1da44d0af0 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseSparkCatalog.scala +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseSparkCatalog.scala @@ -15,6 +15,7 @@ * limitations under the License. */ package org.apache.spark.sql.execution.datasources.v2.clickhouse + import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.spark.sql.{AnalysisException, DataFrame, SparkSession} @@ -26,12 +27,12 @@ import org.apache.spark.sql.connector.catalog._ import org.apache.spark.sql.connector.catalog.TableCapability.V1_BATCH_WRITE import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.connector.write.{LogicalWriteInfo, V1Write, WriteBuilder} -import org.apache.spark.sql.delta.{DeltaErrors, DeltaLog, DeltaOptions} +import org.apache.spark.sql.delta.{DeltaConfigs, DeltaErrors, DeltaLog, DeltaOptions, DeltaTableUtils} import org.apache.spark.sql.delta.DeltaTableIdentifier.gluePermissionError -import org.apache.spark.sql.delta.catalog.{ClickHouseTableV2, TempClickHouseTableV2} +import org.apache.spark.sql.delta.catalog.{ClickHouseTableV2, DeltaTableV2, TempClickHouseTableV2} import org.apache.spark.sql.delta.commands.{CreateDeltaTableCommand, TableCreationModes, WriteIntoDelta} import org.apache.spark.sql.delta.metering.DeltaLogging -import org.apache.spark.sql.delta.sources.DeltaSQLConf +import org.apache.spark.sql.delta.sources.{DeltaSourceUtils, DeltaSQLConf} import org.apache.spark.sql.execution.datasources.{DataSource, PartitioningUtils} import org.apache.spark.sql.execution.datasources.v2.clickhouse.utils.CHDataSourceUtils import org.apache.spark.sql.sources.InsertableRelation @@ -52,6 +53,15 @@ class ClickHouseSparkCatalog val spark = SparkSession.active + private def createCatalogTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String] + ): Table = { + super.createTable(ident, schema, partitions, properties) + } + override def createTable( ident: Identifier, schema: StructType, @@ -66,8 +76,18 @@ class ClickHouseSparkCatalog Map.empty, sourceQuery = None, TableCreationModes.Create) + } else if (DeltaSourceUtils.isDeltaDataSourceName(getProvider(properties))) { + 
createDeltaTable( + ident, + schema, + partitions, + properties, + Map.empty, + sourceQuery = None, + TableCreationModes.Create + ) } else { - super.createTable(ident, schema, partitions, properties) + createCatalogTable(ident, schema, partitions, properties) } } @@ -120,7 +140,10 @@ class ClickHouseSparkCatalog .copy(locationUri = locUriOpt) val tableType = if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED - val id = TableIdentifier(ident.name(), ident.namespace().lastOption) + val id = { + TableIdentifier(ident.name(), ident.namespace().lastOption) + } + val existingTableOpt = getExistingTableIfExists(id) val loc = new Path(locUriOpt.getOrElse(spark.sessionState.catalog.defaultTablePath(id))) val commentOpt = Option(allTableProperties.get("comment")) @@ -136,7 +159,7 @@ class ClickHouseSparkCatalog comment = commentOpt ) - val withDb = verifyTableAndSolidify(tableDesc, None) + val withDb = verifyTableAndSolidify(tableDesc, None, true) val writer = sourceQuery.map { df => @@ -156,7 +179,7 @@ class ClickHouseSparkCatalog CreateDeltaTableCommand( withDb, - getExistingTableIfExists(tableDesc), + existingTableOpt, operation.mode, writer, operation = operation, @@ -166,14 +189,134 @@ class ClickHouseSparkCatalog } logInfo(s"create table ${ident.toString} successfully.") - val loadedNewTable = loadTable(ident) - loadedNewTable + loadTable(ident) + } + + /** + * Creates a Delta table + * + * @param ident + * The identifier of the table + * @param schema + * The schema of the table + * @param partitions + * The partition transforms for the table + * @param allTableProperties + * The table properties that configure the behavior of the table or provide information about + * the table + * @param writeOptions + * Options specific to the write during table creation or replacement + * @param sourceQuery + * A query if this CREATE request came from a CTAS or RTAS + * @param operation + * The specific table creation mode, whether this is a Create/Replace/Create or Replace + */ + private def createDeltaTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + allTableProperties: util.Map[String, String], + writeOptions: Map[String, String], + sourceQuery: Option[DataFrame], + operation: TableCreationModes.CreationMode + ): Table = { + // These two keys are tableProperties in data source v2 but not in v1, so we have to filter + // them out. Otherwise property consistency checks will fail. + val tableProperties = allTableProperties.asScala.filterKeys { + case TableCatalog.PROP_LOCATION => false + case TableCatalog.PROP_PROVIDER => false + case TableCatalog.PROP_COMMENT => false + case TableCatalog.PROP_OWNER => false + case TableCatalog.PROP_EXTERNAL => false + case "path" => false + case _ => true + }.toMap + val (partitionColumns, maybeBucketSpec) = + SparkShimLoader.getSparkShims.convertPartitionTransforms(partitions) + var newSchema = schema + var newPartitionColumns = partitionColumns + var newBucketSpec = maybeBucketSpec + val conf = spark.sessionState.conf + + val isByPath = isPathIdentifier(ident) + if ( + isByPath && !conf.getConf(DeltaSQLConf.DELTA_LEGACY_ALLOW_AMBIGUOUS_PATHS) + && allTableProperties.containsKey("location") + // The location property can be qualified and different from the path in the identifier, so + // we check `endsWith` here. 
+ && Option(allTableProperties.get("location")).exists(!_.endsWith(ident.name())) + ) { + throw DeltaErrors.ambiguousPathsInCreateTableException( + ident.name(), + allTableProperties.get("location")) + } + val location = if (isByPath) { + Option(ident.name()) + } else { + Option(allTableProperties.get("location")) + } + val id = { + TableIdentifier(ident.name(), ident.namespace().lastOption) + } + var locUriOpt = location.map(CatalogUtils.stringToURI) + val existingTableOpt = getExistingTableIfExists(id) + val loc = locUriOpt + .orElse(existingTableOpt.flatMap(_.storage.locationUri)) + .getOrElse(spark.sessionState.catalog.defaultTablePath(id)) + val storage = DataSource + .buildStorageFormatFromOptions(writeOptions) + .copy(locationUri = Option(loc)) + val tableType = + if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED + val commentOpt = Option(allTableProperties.get("comment")) + + var tableDesc = new CatalogTable( + identifier = id, + tableType = tableType, + storage = storage, + schema = newSchema, + provider = Some(DeltaSourceUtils.ALT_NAME), + partitionColumnNames = newPartitionColumns, + bucketSpec = newBucketSpec, + properties = tableProperties, + comment = commentOpt + ) + + val withDb = verifyTableAndSolidify(tableDesc, None) + + val writer = sourceQuery.map { + df => + WriteIntoDelta( + DeltaLog.forTable(spark, new Path(loc)), + operation.mode, + new DeltaOptions(withDb.storage.properties, spark.sessionState.conf), + withDb.partitionColumnNames, + withDb.properties ++ commentOpt.map("comment" -> _), + df, + schemaInCatalog = if (newSchema != schema) Some(newSchema) else None + ) + } + + CreateDeltaTableCommand( + withDb, + existingTableOpt, + operation.mode, + writer, + operation, + tableByPath = isByPath).run(spark) + + loadTable(ident) } /** Performs checks on the parameters provided for table creation for a ClickHouse table. */ private def verifyTableAndSolidify( tableDesc: CatalogTable, - query: Option[LogicalPlan]): CatalogTable = { + query: Option[LogicalPlan], + isMergeTree: Boolean = false): CatalogTable = { + + if (!isMergeTree && tableDesc.bucketSpec.isDefined) { + throw DeltaErrors.operationNotSupportedException("Bucketing", tableDesc.identifier) + } val schema = query .map { @@ -189,30 +332,36 @@ class ClickHouseSparkCatalog caseSensitive = false ) // Delta is case insensitive + val validatedConfigurations = if (isMergeTree) { + tableDesc.properties + } else { + DeltaConfigs.validateConfigurations(tableDesc.properties) + } + val db = tableDesc.identifier.database.getOrElse(catalog.getCurrentDatabase) val tableIdentWithDB = tableDesc.identifier.copy(database = Some(db)) tableDesc.copy( identifier = tableIdentWithDB, schema = schema, - properties = tableDesc.properties) + properties = validatedConfigurations) } /** Checks if a table already exists for the provided identifier. */ - private def getExistingTableIfExists(table: CatalogTable): Option[CatalogTable] = { + def getExistingTableIfExists(table: TableIdentifier): Option[CatalogTable] = { // If this is a path identifier, we cannot return an existing CatalogTable. 
The Create command // will check the file system itself if (isPathIdentifier(table)) return None - val tableExists = catalog.tableExists(table.identifier) + val tableExists = catalog.tableExists(table) if (tableExists) { - val oldTable = catalog.getTableMetadata(table.identifier) + val oldTable = catalog.getTableMetadata(table) if (oldTable.tableType == CatalogTableType.VIEW) { - throw new AnalysisException( - s"${table.identifier} is a view. You may not write data into a view.") + throw new AnalysisException(s"$table is a view. You may not write data into a view.") } - if (!CHDataSourceUtils.isClickHouseTable(oldTable.provider)) { - throw new AnalysisException( - s"${table.identifier} is not a ClickHouse table. Please drop " + - s"this table first if you would like to recreate it.") + if ( + !DeltaSourceUtils.isDeltaTable(oldTable.provider) && + !CHDataSourceUtils.isClickHouseTable(oldTable.provider) + ) { + throw DeltaErrors.notADeltaTable(table.table) } Some(oldTable) } else { @@ -233,6 +382,12 @@ class ClickHouseSparkCatalog new Path(v1.catalogTable.location), catalogTable = Some(v1.catalogTable), tableIdentifier = Some(ident.toString)) + case v1: V1Table if DeltaTableUtils.isDeltaTable(v1.catalogTable) => + DeltaTableV2( + spark, + new Path(v1.catalogTable.location), + catalogTable = Some(v1.catalogTable), + tableIdentifier = Some(ident.toString)) case o => o } @@ -249,8 +404,12 @@ class ClickHouseSparkCatalog } } - private def newDeltaPathTable(ident: Identifier): ClickHouseTableV2 = { - new ClickHouseTableV2(spark, new Path(ident.name())) + private def newDeltaPathTable(ident: Identifier): DeltaTableV2 = { + if (hasClickHouseNamespace(ident)) { + new ClickHouseTableV2(spark, new Path(ident.name())) + } else { + DeltaTableV2(spark, new Path(ident.name())) + } } /** support to delete mergetree data from the external table */ @@ -284,11 +443,15 @@ class ClickHouseSparkCatalog partitions: Array[Transform], properties: util.Map[String, String]): StagedTable = recordFrameProfile("DeltaCatalog", "stageReplace") { - if (CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties))) { + if ( + CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties)) || + DeltaSourceUtils.isDeltaDataSourceName(getProvider(properties)) + ) { new StagedDeltaTableV2(ident, schema, partitions, properties, TableCreationModes.Replace) } else { super.dropTable(ident) - BestEffortStagedTable(ident, super.createTable(ident, schema, partitions, properties), this) + val table = createCatalogTable(ident, schema, partitions, properties) + BestEffortStagedTable(ident, table, this) } } @@ -298,7 +461,10 @@ class ClickHouseSparkCatalog partitions: Array[Transform], properties: util.Map[String, String]): StagedTable = recordFrameProfile("DeltaCatalog", "stageCreateOrReplace") { - if (CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties))) { + if ( + CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties)) || + DeltaSourceUtils.isDeltaDataSourceName(getProvider(properties)) + ) { new StagedDeltaTableV2( ident, schema, @@ -311,7 +477,8 @@ class ClickHouseSparkCatalog case _: NoSuchDatabaseException => // this is fine case _: NoSuchTableException => // this is fine } - BestEffortStagedTable(ident, super.createTable(ident, schema, partitions, properties), this) + val table = createCatalogTable(ident, schema, partitions, properties) + BestEffortStagedTable(ident, table, this) } } @@ -321,13 +488,22 @@ class ClickHouseSparkCatalog partitions: Array[Transform], properties: 
util.Map[String, String]): StagedTable = recordFrameProfile("DeltaCatalog", "stageCreate") { - if (CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties))) { + if ( + CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties)) || + DeltaSourceUtils.isDeltaDataSourceName(getProvider(properties)) + ) { new StagedDeltaTableV2(ident, schema, partitions, properties, TableCreationModes.Create) } else { - BestEffortStagedTable(ident, super.createTable(ident, schema, partitions, properties), this) + val table = createCatalogTable(ident, schema, partitions, properties) + BestEffortStagedTable(ident, table, this) } } + /** + * A staged delta table, which creates a HiveMetaStore entry and appends data if this was a + * CTAS/RTAS command. We have an ugly way of using this API right now, but it's the best way to + * maintain old behavior compatibility between Databricks Runtime and OSS Delta Lake. + */ private class StagedDeltaTableV2( ident: Identifier, override val schema: StructType, @@ -374,14 +550,18 @@ class ClickHouseSparkCatalog } } } - createClickHouseTable( - ident, - schema, - partitions, - props, - writeOptions, - asSelectQuery, - operation) + if (CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties))) { + createClickHouseTable( + ident, + schema, + partitions, + props, + writeOptions, + asSelectQuery, + operation) + } else { + createDeltaTable(ident, schema, partitions, props, writeOptions, asSelectQuery, operation) + } } override def name(): String = ident.name() @@ -454,20 +634,29 @@ trait SupportsPathIdentifier extends TableCatalog { protected def isPathIdentifier(ident: Identifier): Boolean = { // Should be a simple check of a special PathIdentifier class in the future try { - supportSQLOnFile && hasClickHouseNamespace(ident) && new Path(ident.name()).isAbsolute + supportSQLOnFile && (hasClickHouseNamespace(ident) || hasDeltaNamespace(ident)) && + new Path(ident.name()).isAbsolute } catch { case _: IllegalArgumentException => false } } + protected def isPathIdentifier(table: CatalogTable): Boolean = { + isPathIdentifier(table.identifier) + } + + protected def isPathIdentifier(tableIdentifier: TableIdentifier): Boolean = { + isPathIdentifier(Identifier.of(tableIdentifier.database.toArray, tableIdentifier.table)) + } + private def supportSQLOnFile: Boolean = spark.sessionState.conf.runSQLonFile - private def hasClickHouseNamespace(ident: Identifier): Boolean = { + protected def hasClickHouseNamespace(ident: Identifier): Boolean = { ident.namespace().length == 1 && CHDataSourceUtils.isClickHouseDataSourceName(ident.namespace().head) } - protected def isPathIdentifier(table: CatalogTable): Boolean = { - isPathIdentifier(Identifier.of(table.identifier.database.toArray, table.identifier.table)) + protected def hasDeltaNamespace(ident: Identifier): Boolean = { + ident.namespace().length == 1 && DeltaSourceUtils.isDeltaDataSourceName(ident.namespace().head) } } diff --git a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/VacuumCommand.scala b/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/VacuumCommand.scala deleted file mode 100644 index dc833c2b2fae9..0000000000000 --- a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/VacuumCommand.scala +++ /dev/null @@ -1,464 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.sql.delta.commands - -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql.{Column, DataFrame, Dataset, SparkSession} -import org.apache.spark.sql.delta._ -import org.apache.spark.sql.delta.actions.{FileAction, RemoveFile} -import org.apache.spark.sql.delta.sources.DeltaSQLConf -import org.apache.spark.sql.delta.util.DeltaFileOperations -import org.apache.spark.sql.delta.util.DeltaFileOperations.tryDeleteNonRecursive -import org.apache.spark.sql.functions._ -import org.apache.spark.util.{Clock, SerializableConfiguration, SystemClock} - -// scalastyle:off import.ordering.noEmptyLine -import com.fasterxml.jackson.databind.annotation.JsonDeserialize -import org.apache.hadoop.fs.{FileSystem, Path} - -import java.net.URI -import java.util.Date -import java.util.concurrent.TimeUnit - -import scala.collection.JavaConverters._ - -/** - * Gluten overwrite Delta: - * - * This file is copied from Delta 2.2.0. It is modified to overcome the following issues: - * 1. In Gluten, part is a directory, but VacuumCommand assumes part is a file. So we need some - * modifications to make it work. - */ - -/** - * Vacuums the table by clearing all untracked files and folders within this table. First lists all - * the files and directories in the table, and gets the relative paths with respect to the base of - * the table. Then it gets the list of all tracked files for this table, which may or may not be - * within the table base path, and gets the relative paths of all the tracked files with respect to - * the base of the table. Files outside of the table path will be ignored. Then we take a diff of - * the files and delete directories that were already empty, and all files that are within the table - * that are no longer tracked. - */ -object VacuumCommand extends VacuumCommandImpl with Serializable { - - case class FileNameAndSize(path: String, length: Long, isDir: Boolean) - - /** - * Additional check on retention duration to prevent people from shooting themselves in the foot. - */ - protected def checkRetentionPeriodSafety( - spark: SparkSession, - retentionMs: Option[Long], - configuredRetention: Long): Unit = { - require(retentionMs.forall(_ >= 0), "Retention for Vacuum can't be less than 0.") - val checkEnabled = - spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_RETENTION_CHECK_ENABLED) - val retentionSafe = retentionMs.forall(_ >= configuredRetention) - var configuredRetentionHours = TimeUnit.MILLISECONDS.toHours(configuredRetention) - if (TimeUnit.HOURS.toMillis(configuredRetentionHours) < configuredRetention) { - configuredRetentionHours += 1 - } - require( - !checkEnabled || retentionSafe, - s"""Are you sure you would like to vacuum files with such a low retention period? 
If you have - |writers that are currently writing to this table, there is a risk that you may corrupt the - |state of your Delta table. - | - |If you are certain that there are no operations being performed on this table, such as - |insert/upsert/delete/optimize, then you may turn off this check by setting: - |spark.databricks.delta.retentionDurationCheck.enabled = false - | - |If you are not sure, please use a value not less than "$configuredRetentionHours hours". - """.stripMargin - ) - } - - /** - * Clears all untracked files and folders within this table. First lists all the files and - * directories in the table, and gets the relative paths with respect to the base of the table. - * Then it gets the list of all tracked files for this table, which may or may not be within the - * table base path, and gets the relative paths of all the tracked files with respect to the base - * of the table. Files outside of the table path will be ignored. Then we take a diff of the files - * and delete directories that were already empty, and all files that are within the table that - * are no longer tracked. - * - * @param dryRun - * If set to true, no files will be deleted. Instead, we will list all files and directories - * that will be cleared. - * @param retentionHours - * An optional parameter to override the default Delta tombstone retention period - * @return - * A Dataset containing the paths of the files/folders to delete in dryRun mode. Otherwise - * returns the base path of the table. - */ - def gc( - spark: SparkSession, - deltaLog: DeltaLog, - dryRun: Boolean = true, - retentionHours: Option[Double] = None, - clock: Clock = new SystemClock): DataFrame = { - recordDeltaOperation(deltaLog, "delta.gc") { - - val path = deltaLog.dataPath - val deltaHadoopConf = deltaLog.newDeltaHadoopConf() - val fs = path.getFileSystem(deltaHadoopConf) - - import org.apache.spark.sql.delta.implicits._ - - val snapshot = deltaLog.update() - - require( - snapshot.version >= 0, - "No state defined for this table. Is this really " + - "a Delta table? 
Refusing to garbage collect.") - - val retentionMillis = retentionHours.map(h => TimeUnit.HOURS.toMillis(math.round(h))) - checkRetentionPeriodSafety(spark, retentionMillis, deltaLog.tombstoneRetentionMillis) - - val deleteBeforeTimestamp = retentionMillis - .map(millis => clock.getTimeMillis() - millis) - .getOrElse(deltaLog.minFileRetentionTimestamp) - logInfo( - s"Starting garbage collection (dryRun = $dryRun) of untracked files older than " + - s"${new Date(deleteBeforeTimestamp).toString} in $path") - val hadoopConf = spark.sparkContext.broadcast(new SerializableConfiguration(deltaHadoopConf)) - val basePath = fs.makeQualified(path).toString - var isBloomFiltered = false - val parallelDeleteEnabled = - spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_PARALLEL_DELETE_ENABLED) - val parallelDeletePartitions = - spark.sessionState.conf - .getConf(DeltaSQLConf.DELTA_VACUUM_PARALLEL_DELETE_PARALLELISM) - .getOrElse(spark.sessionState.conf.numShufflePartitions) - val relativizeIgnoreError = - spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_RELATIVIZE_IGNORE_ERROR) - val startTimeToIdentifyEligibleFiles = System.currentTimeMillis() - val validFiles = snapshot.stateDS - .mapPartitions { - actions => - val reservoirBase = new Path(basePath) - val fs = reservoirBase.getFileSystem(hadoopConf.value.value) - actions.flatMap { - _.unwrap match { - case tombstone: RemoveFile if tombstone.delTimestamp < deleteBeforeTimestamp => - Nil - case fa: FileAction => - getValidRelativePathsAndSubdirs( - fa, - fs, - reservoirBase, - relativizeIgnoreError, - isBloomFiltered) - case _ => Nil - } - } - } - .toDF("path") - - val partitionColumns = snapshot.metadata.partitionSchema.fieldNames - val parallelism = spark.sessionState.conf.parallelPartitionDiscoveryParallelism - - val allFilesAndDirs = DeltaFileOperations - .recursiveListDirs( - spark, - Seq(basePath), - hadoopConf, - hiddenDirNameFilter = DeltaTableUtils.isHiddenDirectory(partitionColumns, _), - hiddenFileNameFilter = DeltaTableUtils.isHiddenDirectory(partitionColumns, _), - fileListingParallelism = Option(parallelism) - ) - .groupByKey(_.path) - .mapGroups { - (k, v) => - val duplicates = v.toSeq - // of all the duplicates we can return the newest file. - duplicates.maxBy(_.modificationTime) - } - - try { - allFilesAndDirs.cache() - - implicit val fileNameAndSizeEncoder = org.apache.spark.sql.Encoders.product[FileNameAndSize] - - val dirCounts = allFilesAndDirs.where(col("isDir")).count() + 1 // +1 for the base path - - // The logic below is as follows: - // 1. We take all the files and directories listed in our reservoir - // 2. We filter all files older than our tombstone retention period and directories - // 3. We get the subdirectories of all files so that we can find non-empty directories - // 4. We groupBy each path, and count to get how many files are in each sub-directory - // 5. We subtract all the valid files and tombstones in our state - // 6. We filter all paths with a count of 1, which will correspond to files not in the - // state, and empty directories. 
We can safely delete all of these - val diff_temp = allFilesAndDirs - .where(col("modificationTime") < deleteBeforeTimestamp || col("isDir")) - .mapPartitions { - fileStatusIterator => - val reservoirBase = new Path(basePath) - val fs = reservoirBase.getFileSystem(hadoopConf.value.value) - fileStatusIterator.flatMap { - fileStatus => - if (fileStatus.isDir) { - Iterator.single( - FileNameAndSize( - relativize(fileStatus.getHadoopPath, fs, reservoirBase, isDir = true), - 0L, - true)) - } else { - val dirs = getAllSubdirs(basePath, fileStatus.path, fs) - val dirsWithSlash = dirs.map { - p => - val relativizedPath = - relativize(new Path(p), fs, reservoirBase, isDir = true) - FileNameAndSize(relativizedPath, 0L, true) - } - dirsWithSlash ++ Iterator( - FileNameAndSize( - relativize(fileStatus.getHadoopPath, fs, reservoirBase, isDir = false), - fileStatus.length, - false)) - } - } - } - .withColumn( - "dir", - when(col("isDir"), col("path")) - .otherwise(expr("substring_index(path, '/',size(split(path, '/')) -1)"))) - .groupBy(col("path"), col("dir")) - .agg(count(new Column("*")).as("count"), sum("length").as("length")) - - val diff = diff_temp - .join(validFiles, diff_temp("dir") === validFiles("path"), "leftanti") - .where(col("count") === 1) - - val sizeOfDataToDeleteRow = diff.agg(sum("length").cast("long")).first - val sizeOfDataToDelete = if (sizeOfDataToDeleteRow.isNullAt(0)) { - 0L - } else { - sizeOfDataToDeleteRow.getLong(0) - } - - val diffFiles = diff - .select(col("path")) - .as[String] - .map { - relativePath => - assert( - !stringToPath(relativePath).isAbsolute, - "Shouldn't have any absolute paths for deletion here.") - pathToString(DeltaFileOperations.absolutePath(basePath, relativePath)) - } - val timeTakenToIdentifyEligibleFiles = - System.currentTimeMillis() - startTimeToIdentifyEligibleFiles - - if (dryRun) { - val numFiles = diffFiles.count() - val stats = DeltaVacuumStats( - isDryRun = true, - specifiedRetentionMillis = retentionMillis, - defaultRetentionMillis = deltaLog.tombstoneRetentionMillis, - minRetainedTimestamp = deleteBeforeTimestamp, - dirsPresentBeforeDelete = dirCounts, - objectsDeleted = numFiles, - sizeOfDataToDelete = sizeOfDataToDelete, - timeTakenToIdentifyEligibleFiles = timeTakenToIdentifyEligibleFiles, - timeTakenForDelete = 0L - ) - - recordDeltaEvent(deltaLog, "delta.gc.stats", data = stats) - logConsole( - s"Found $numFiles files and directories in a total of " + - s"$dirCounts directories that are safe to delete.$stats") - - return diffFiles.map(f => stringToPath(f).toString).toDF("path") - } - logVacuumStart( - spark, - deltaLog, - path, - diffFiles, - sizeOfDataToDelete, - retentionMillis, - deltaLog.tombstoneRetentionMillis) - - val deleteStartTime = System.currentTimeMillis() - val filesDeleted = - try { - delete( - diffFiles, - spark, - basePath, - hadoopConf, - parallelDeleteEnabled, - parallelDeletePartitions) - } catch { - case t: Throwable => - logVacuumEnd(deltaLog, spark, path) - throw t - } - val timeTakenForDelete = System.currentTimeMillis() - deleteStartTime - val stats = DeltaVacuumStats( - isDryRun = false, - specifiedRetentionMillis = retentionMillis, - defaultRetentionMillis = deltaLog.tombstoneRetentionMillis, - minRetainedTimestamp = deleteBeforeTimestamp, - dirsPresentBeforeDelete = dirCounts, - objectsDeleted = filesDeleted, - sizeOfDataToDelete = sizeOfDataToDelete, - timeTakenToIdentifyEligibleFiles = timeTakenToIdentifyEligibleFiles, - timeTakenForDelete = timeTakenForDelete - ) - recordDeltaEvent(deltaLog, 
"delta.gc.stats", data = stats) - logVacuumEnd(deltaLog, spark, path, Some(filesDeleted), Some(dirCounts)) - - spark.createDataset(Seq(basePath)).toDF("path") - } finally { - allFilesAndDirs.unpersist() - } - } - } -} - -trait VacuumCommandImpl extends DeltaCommand { - - protected def logVacuumStart( - spark: SparkSession, - deltaLog: DeltaLog, - path: Path, - diff: Dataset[String], - sizeOfDataToDelete: Long, - specifiedRetentionMillis: Option[Long], - defaultRetentionMillis: Long): Unit = { - logInfo( - s"Deleting untracked files and empty directories in $path. The amount of data to be " + - s"deleted is $sizeOfDataToDelete (in bytes)") - } - - protected def logVacuumEnd( - deltaLog: DeltaLog, - spark: SparkSession, - path: Path, - filesDeleted: Option[Long] = None, - dirCounts: Option[Long] = None): Unit = { - if (filesDeleted.nonEmpty) { - logConsole( - s"Deleted ${filesDeleted.get} files and directories in a total " + - s"of ${dirCounts.get} directories.") - } - } - - /** - * Attempts to relativize the `path` with respect to the `reservoirBase` and converts the path to - * a string. - */ - protected def relativize( - path: Path, - fs: FileSystem, - reservoirBase: Path, - isDir: Boolean): String = { - pathToString(DeltaFileOperations.tryRelativizePath(fs, reservoirBase, path)) - } - - /** - * Wrapper function for DeltaFileOperations.getAllSubDirectories returns all subdirectories that - * `file` has with respect to `base`. - */ - protected def getAllSubdirs(base: String, file: String, fs: FileSystem): Iterator[String] = { - DeltaFileOperations.getAllSubDirectories(base, file)._1 - } - - /** Attempts to delete the list of candidate files. Returns the number of files deleted. */ - protected def delete( - diff: Dataset[String], - spark: SparkSession, - basePath: String, - hadoopConf: Broadcast[SerializableConfiguration], - parallel: Boolean, - parallelPartitions: Int): Long = { - import org.apache.spark.sql.delta.implicits._ - - if (parallel) { - diff - .repartition(parallelPartitions) - .mapPartitions { - files => - val fs = new Path(basePath).getFileSystem(hadoopConf.value.value) - val filesDeletedPerPartition = - files.map(p => stringToPath(p)).count(f => tryDeleteNonRecursive(fs, f)) - Iterator(filesDeletedPerPartition) - } - .collect() - .sum - } else { - val fs = new Path(basePath).getFileSystem(hadoopConf.value.value) - val fileResultSet = diff.toLocalIterator().asScala - fileResultSet.map(p => stringToPath(p)).count(f => tryDeleteNonRecursive(fs, f)) - } - } - - protected def stringToPath(path: String): Path = new Path(new URI(path)) - - protected def pathToString(path: Path): String = path.toUri.toString - - /** Returns the relative path of a file action or None if the file lives outside of the table. */ - protected def getActionRelativePath( - action: FileAction, - fs: FileSystem, - basePath: Path, - relativizeIgnoreError: Boolean): Option[String] = { - val filePath = stringToPath(action.path) - if (filePath.isAbsolute) { - val maybeRelative = - DeltaFileOperations.tryRelativizePath(fs, basePath, filePath, relativizeIgnoreError) - if (maybeRelative.isAbsolute) { - // This file lives outside the directory of the table. - None - } else { - Some(pathToString(maybeRelative)) - } - } else { - Some(pathToString(filePath)) - } - } - - /** - * Returns the relative paths of all files and subdirectories for this action that must be - * retained during GC. 
- */ - protected def getValidRelativePathsAndSubdirs( - action: FileAction, - fs: FileSystem, - basePath: Path, - relativizeIgnoreError: Boolean, - isBloomFiltered: Boolean): Seq[String] = { - getActionRelativePath(action, fs, basePath, relativizeIgnoreError) - .map(relativePath => Seq(relativePath) ++ getAllSubdirs("/", relativePath, fs)) - .getOrElse(Seq.empty) - } -} - -case class DeltaVacuumStats( - isDryRun: Boolean, - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - specifiedRetentionMillis: Option[Long], - defaultRetentionMillis: Long, - minRetainedTimestamp: Long, - dirsPresentBeforeDelete: Long, - objectsDeleted: Long, - sizeOfDataToDelete: Long, - timeTakenToIdentifyEligibleFiles: Long, - timeTakenForDelete: Long) diff --git a/backends-clickhouse/src/main/delta-22/io/delta/tables/ClickhouseTable.scala b/backends-clickhouse/src/main/delta-23/io/delta/tables/ClickhouseTable.scala similarity index 100% rename from backends-clickhouse/src/main/delta-22/io/delta/tables/ClickhouseTable.scala rename to backends-clickhouse/src/main/delta-23/io/delta/tables/ClickhouseTable.scala diff --git a/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala new file mode 100644 index 0000000000000..0794b45158e64 --- /dev/null +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.delta + +import org.apache.gluten.backendsapi.clickhouse.CHBackendSettings +import org.apache.gluten.execution.ColumnarToRowExecBase + +import org.apache.spark.SparkException +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.delta.actions._ +import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 +import org.apache.spark.sql.delta.constraints.{Constraint, Constraints} +import org.apache.spark.sql.delta.files.MergeTreeCommitProtocol +import org.apache.spark.sql.delta.schema.InvariantViolationException +import org.apache.spark.sql.delta.sources.DeltaSQLConf +import org.apache.spark.sql.execution.{SparkPlan, SQLExecution} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec +import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FakeRowAdaptor, FileFormatWriter, WriteJobStatsTracker} +import org.apache.spark.sql.execution.datasources.v1.clickhouse.MergeTreeFileFormatWriter +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig +import org.apache.spark.util.{Clock, SerializableConfiguration} + +import org.apache.commons.lang3.exception.ExceptionUtils + +import scala.collection.mutable.ListBuffer + +object ClickhouseOptimisticTransaction {} +class ClickhouseOptimisticTransaction( + override val deltaLog: DeltaLog, + override val snapshot: Snapshot)(implicit override val clock: Clock) + extends OptimisticTransaction(deltaLog, snapshot) { + + def this(deltaLog: DeltaLog, snapshotOpt: Option[Snapshot] = None)(implicit clock: Clock) { + this( + deltaLog, + snapshotOpt.getOrElse(deltaLog.update()) + ) + } + + def insertFakeRowAdaptor(queryPlan: SparkPlan): SparkPlan = queryPlan match { + // if the child is columnar, we can just wrap&transfer the columnar data + case c2r: ColumnarToRowExecBase => + FakeRowAdaptor(c2r.child) + // If the child is aqe, we make aqe "support columnar", + // then aqe itself will guarantee to generate columnar outputs. 
+    // So FakeRowAdaptor will always consume columnar data,
+    // thus avoiding the case of c2r->aqe->r2c->writer
+    case aqe: AdaptiveSparkPlanExec =>
+      FakeRowAdaptor(
+        AdaptiveSparkPlanExec(
+          aqe.inputPlan,
+          aqe.context,
+          aqe.preprocessingRules,
+          aqe.isSubquery,
+          supportsColumnar = true
+        ))
+    case other => FakeRowAdaptor(other)
+  }
+
+  override def writeFiles(
+      inputData: Dataset[_],
+      writeOptions: Option[DeltaOptions],
+      additionalConstraints: Seq[Constraint]): Seq[FileAction] = {
+    if (ClickHouseConfig.isMergeTreeFormatEngine(metadata.configuration)) {
+      hasWritten = true
+
+      val spark = inputData.sparkSession
+      val (data, partitionSchema) = performCDCPartition(inputData)
+      val outputPath = deltaLog.dataPath
+
+      val (queryExecution, output, generatedColumnConstraints, _) =
+        normalizeData(deltaLog, data)
+      val partitioningColumns = getPartitioningColumns(partitionSchema, output)
+
+      val committer = new MergeTreeCommitProtocol("delta-mergetree", outputPath.toString, None)
+
+      // val (optionalStatsTracker, _) =
+      //   getOptionalStatsTrackerAndStatsCollection(output, outputPath, partitionSchema, data)
+      val (optionalStatsTracker, _) = (None, None)
+
+      val constraints =
+        Constraints.getAll(metadata, spark) ++ generatedColumnConstraints ++ additionalConstraints
+
+      SQLExecution.withNewExecutionId(queryExecution, Option("deltaTransactionalWrite")) {
+        val outputSpec = FileFormatWriter.OutputSpec(outputPath.toString, Map.empty, output)
+
+        val queryPlan = queryExecution.executedPlan
+        val newQueryPlan = insertFakeRowAdaptor(queryPlan)
+
+        val statsTrackers: ListBuffer[WriteJobStatsTracker] = ListBuffer()
+
+        if (spark.conf.get(DeltaSQLConf.DELTA_HISTORY_METRICS_ENABLED)) {
+          val basicWriteJobStatsTracker = new BasicWriteJobStatsTracker(
+            new SerializableConfiguration(deltaLog.newDeltaHadoopConf()),
+            BasicWriteJobStatsTracker.metrics)
+          // registerSQLMetrics(spark, basicWriteJobStatsTracker.driverSideMetrics)
+          statsTrackers.append(basicWriteJobStatsTracker)
+        }
+
+        // Retain only a minimal selection of Spark writer options to avoid any potential
+        // compatibility issues
+        var options = writeOptions match {
+          case None => Map.empty[String, String]
+          case Some(writeOptions) =>
+            writeOptions.options.filterKeys {
+              key =>
+                key.equalsIgnoreCase(DeltaOptions.MAX_RECORDS_PER_FILE) ||
+                key.equalsIgnoreCase(DeltaOptions.COMPRESSION)
+            }.toMap
+        }
+
+        spark.conf.getAll.foreach(
+          entry => {
+            if (
+              entry._1.startsWith(s"${CHBackendSettings.getBackendConfigPrefix}.runtime_settings")
+              || entry._1.equalsIgnoreCase(DeltaSQLConf.DELTA_OPTIMIZE_MIN_FILE_SIZE.key)
+            ) {
+              options += (entry._1 -> entry._2)
+            }
+          })
+
+        try {
+          val tableV2 = ClickHouseTableV2.getTable(deltaLog)
+          MergeTreeFileFormatWriter.write(
+            sparkSession = spark,
+            plan = newQueryPlan,
+            fileFormat = tableV2.getFileFormat(metadata),
+            // formats.
+ committer = committer, + outputSpec = outputSpec, + // scalastyle:off deltahadoopconfiguration + hadoopConf = spark.sessionState + .newHadoopConfWithOptions(metadata.configuration ++ deltaLog.options), + // scalastyle:on deltahadoopconfiguration + orderByKeyOption = tableV2.orderByKeyOption, + lowCardKeyOption = tableV2.lowCardKeyOption, + minmaxIndexKeyOption = tableV2.minmaxIndexKeyOption, + bfIndexKeyOption = tableV2.bfIndexKeyOption, + setIndexKeyOption = tableV2.setIndexKeyOption, + primaryKeyOption = tableV2.primaryKeyOption, + partitionColumns = partitioningColumns, + bucketSpec = tableV2.bucketOption, + statsTrackers = optionalStatsTracker.toSeq ++ statsTrackers, + options = options, + constraints = constraints + ) + } catch { + case s: SparkException => + // Pull an InvariantViolationException up to the top level if it was the root cause. + val violationException = ExceptionUtils.getRootCause(s) + if (violationException.isInstanceOf[InvariantViolationException]) { + throw violationException + } else { + throw s + } + } + } + committer.addedStatuses.toSeq ++ committer.changeFiles + } else { + // TODO: support native delta parquet write + // 1. insert FakeRowAdaptor + // 2. DeltaInvariantCheckerExec transform + // 3. DeltaTaskStatisticsTracker collect null count / min values / max values + // 4. set the parameters 'staticPartitionWriteOnly', 'isNativeAppliable', + // 'nativeFormat' in the LocalProperty of the sparkcontext + super.writeFiles(inputData, writeOptions, additionalConstraints) + } + } +} diff --git a/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/DeltaAdapter.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/DeltaAdapter.scala new file mode 100644 index 0000000000000..8a9c5585e888b --- /dev/null +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/DeltaAdapter.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.delta + +object DeltaAdapter extends DeltaAdapterTrait { + override def snapshot(deltaLog: DeltaLog): Snapshot = deltaLog.unsafeVolatileSnapshot +} diff --git a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/DeltaLog.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/DeltaLog.scala similarity index 65% rename from backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/DeltaLog.scala rename to backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/DeltaLog.scala index 4cab6454d15a6..78fbc3fcdb99f 100644 --- a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/DeltaLog.scala +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/DeltaLog.scala @@ -14,15 +14,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.spark.sql.delta -import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.{Resolver, UnresolvedAttribute} -import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable} -import org.apache.spark.sql.catalyst.expressions.{And, Attribute, Cast, Expression, Literal} -import org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper -import org.apache.spark.sql.catalyst.util.FailFastMode +// scalastyle:off import.ordering.noEmptyLine +import java.io.File +import java.lang.ref.WeakReference +import java.net.URI +import java.util.concurrent.TimeUnit +import java.util.concurrent.locks.ReentrantLock + +import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.util.Try +import scala.util.control.NonFatal + +import com.databricks.spark.util.TagDefinitions._ import org.apache.spark.sql.delta.actions._ import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 import org.apache.spark.sql.delta.commands.WriteIntoDelta @@ -30,9 +37,21 @@ import org.apache.spark.sql.delta.commands.cdc.CDCReader import org.apache.spark.sql.delta.files.{TahoeBatchFileIndex, TahoeLogFileIndex} import org.apache.spark.sql.delta.metering.DeltaLogging import org.apache.spark.sql.delta.schema.{SchemaMergingUtils, SchemaUtils} -import org.apache.spark.sql.delta.sources.DeltaSQLConf +import org.apache.spark.sql.delta.sources._ import org.apache.spark.sql.delta.storage.LogStoreProvider +import com.google.common.cache.{CacheBuilder, RemovalNotification} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.{Resolver, UnresolvedAttribute} +import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable} +import org.apache.spark.sql.catalyst.expressions.{And, Attribute, Cast, Expression, Literal} +import org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper +import org.apache.spark.sql.catalyst.util.FailFastMode import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation} @@ -40,56 +59,46 @@ import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util._ -// scalastyle:off 
import.ordering.noEmptyLine -import com.databricks.spark.util.TagDefinitions._ -import com.google.common.cache.{CacheBuilder, RemovalNotification} -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} - -import java.io.File -import java.lang.ref.WeakReference -import java.net.URI -import java.util.concurrent.TimeUnit -import java.util.concurrent.locks.ReentrantLock - -import scala.collection.JavaConverters._ -import scala.collection.mutable -import scala.util.Try -import scala.util.control.NonFatal - -// This class is copied from Delta 2.2.0 because it has a private constructor, -// which makes it impossible to extend - /** * Gluten overwrite Delta: * - * This file is copied from Delta 2.2.0 It is modified to overcome the following issues: - * 1. return ClickhouseOptimisticTransaction 2. return DeltaMergeTreeFileFormat + * This file is copied from Delta 2.3.0, it is modified to overcome the following issues: + * 1. return ClickhouseOptimisticTransaction + * 2. return DeltaMergeTreeFileFormat + * 3. create HadoopFsRelation with the bucket options */ - /** - * Used to query the current state of the log as well as modify it by adding new atomic collections - * of actions. + * Used to query the current state of the log as well as modify it by adding + * new atomic collections of actions. * - * Internally, this class implements an optimistic concurrency control algorithm to handle multiple - * readers or writers. Any single read is guaranteed to see a consistent snapshot of the table. + * Internally, this class implements an optimistic concurrency control + * algorithm to handle multiple readers or writers. Any single read + * is guaranteed to see a consistent snapshot of the table. + * + * @param logPath Path of the Delta log JSONs. + * @param dataPath Path of the data files. + * @param options Filesystem options filtered from `allOptions`. + * @param allOptions All options provided by the user, for example via `df.write.option()`. This + * includes but not limited to filesystem and table properties. + * @param clock Clock to be used when starting a new transaction. */ -class DeltaLog private ( +class DeltaLog private( val logPath: Path, val dataPath: Path, val options: Map[String, String], + val allOptions: Map[String, String], val clock: Clock -) extends Checkpoints + ) extends Checkpoints with MetadataCleanup with LogStoreProvider with SnapshotManagement with DeltaFileFormat with ReadChecksum { + import org.apache.spark.sql.delta.util.FileNames._ - import DeltaLog._ - implicit private lazy val _clock = clock + private lazy implicit val _clock = clock protected def spark = SparkSession.active @@ -120,8 +129,7 @@ class DeltaLog private ( /** Delta History Manager containing version and commit history. */ lazy val history = new DeltaHistoryManager( - this, - spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_HISTORY_PAR_SEARCH_THRESHOLD)) + this, spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_HISTORY_PAR_SEARCH_THRESHOLD)) /* --------------- * | Configuration | @@ -129,61 +137,25 @@ class DeltaLog private ( /** * The max lineage length of a Snapshot before Delta forces to build a Snapshot from scratch. - * Delta will build a Snapshot on top of the previous one if it doesn't see a checkpoint. However, - * there is a race condition that when two writers are writing at the same time, a writer may fail - * to pick up checkpoints written by another one, and the lineage will grow and finally cause - * StackOverflowError. 
Hence we have to force to build a Snapshot from scratch when the lineage - * length is too large to avoid hitting StackOverflowError. + * Delta will build a Snapshot on top of the previous one if it doesn't see a checkpoint. + * However, there is a race condition that when two writers are writing at the same time, + * a writer may fail to pick up checkpoints written by another one, and the lineage will grow + * and finally cause StackOverflowError. Hence we have to force to build a Snapshot from scratch + * when the lineage length is too large to avoid hitting StackOverflowError. */ def maxSnapshotLineageLength: Int = spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_MAX_SNAPSHOT_LINEAGE_LENGTH) - /** How long to keep around logically deleted files before physically deleting them. */ - private[delta] def tombstoneRetentionMillis: Long = - DeltaConfigs.getMilliSeconds(DeltaConfigs.TOMBSTONE_RETENTION.fromMetaData(metadata)) - - // TODO: There is a race here where files could get dropped when increasing the - // retention interval... - protected def metadata = Option(unsafeVolatileSnapshot).map(_.metadata).getOrElse(Metadata()) - - /** - * Tombstones before this timestamp will be dropped from the state and the files can be garbage - * collected. - */ - def minFileRetentionTimestamp: Long = { - // TODO (Fred): Get rid of this FrameProfiler record once SC-94033 is addressed - recordFrameProfile("Delta", "DeltaLog.minFileRetentionTimestamp") { - clock.getTimeMillis() - tombstoneRetentionMillis - } - } - - /** - * [[SetTransaction]]s before this timestamp will be considered expired and dropped from the - * state, but no files will be deleted. - */ - def minSetTransactionRetentionTimestamp: Option[Long] = { - DeltaLog.minSetTransactionRetentionInterval(metadata).map(clock.getTimeMillis() - _) - } - - /** - * Checks whether this table only accepts appends. If so it will throw an error in operations that - * can remove data such as DELETE/UPDATE/MERGE. - */ - def assertRemovable(): Unit = { - if (DeltaConfigs.IS_APPEND_ONLY.fromMetaData(metadata)) { - throw DeltaErrors.modifyAppendOnlyTableException(metadata.name) - } - } - /** The unique identifier for this table. */ - def tableId: String = metadata.id + def tableId: String = unsafeVolatileMetadata.id // safe because table id never changes /** - * Combines the tableId with the path of the table to ensure uniqueness. Normally `tableId` should - * be globally unique, but nothing stops users from copying a Delta table directly to a separate - * location, where the transaction log is copied directly, causing the tableIds to match. When - * users mutate the copied table, and then try to perform some checks joining the two tables, - * optimizations that depend on `tableId` alone may not be correct. Hence we use a composite id. + * Combines the tableId with the path of the table to ensure uniqueness. Normally `tableId` + * should be globally unique, but nothing stops users from copying a Delta table directly to + * a separate location, where the transaction log is copied directly, causing the tableIds to + * match. When users mutate the copied table, and then try to perform some checks joining the + * two tables, optimizations that depend on `tableId` alone may not be correct. Hence we use a + * composite id. 
*/ private[delta] def compositeId: (String, Path) = tableId -> dataPath @@ -224,19 +196,32 @@ class DeltaLog private ( "ignoreCorruptFiles" -> "false", "ignoreMissingFiles" -> "false" ) - val fsRelation = - HadoopFsRelation(index, index.partitionSchema, schema, None, index.format, allOptions)(spark) + // --- modified start + // Don't need to add the bucketOption here, it handles the delta log meta json file + // --- modified end + val fsRelation = HadoopFsRelation( + index, index.partitionSchema, schema, None, index.format, allOptions)(spark) LogicalRelation(fsRelation) } + /** + * Load the data using the FileIndex. This allows us to skip many checks that add overhead, e.g. + * file existence checks, partitioning schema inference. + */ + def loadIndex( + index: DeltaLogFileIndex, + schema: StructType = Action.logSchema): DataFrame = { + Dataset.ofRows(spark, indexToRelation(index, schema)) + } + /* ------------------ * | Delta Management | * ------------------ */ /** - * Returns a new [[OptimisticTransaction]] that can be used to read the current state of the log - * and then commit updates. The reads and updates will be checked for logical conflicts with any - * concurrent writes to the log. + * Returns a new [[OptimisticTransaction]] that can be used to read the current state of the + * log and then commit updates. The reads and updates will be checked for logical conflicts + * with any concurrent writes to the log. * * Note that all reads in a transaction must go through the returned transaction object, and not * directly to the [[DeltaLog]] otherwise they will not be checked for conflicts. @@ -244,17 +229,18 @@ class DeltaLog private ( def startTransaction(): OptimisticTransaction = startTransaction(None) def startTransaction(snapshotOpt: Option[Snapshot]): OptimisticTransaction = { + // --- modified start new ClickhouseOptimisticTransaction(this, snapshotOpt) + // --- modified end } /** - * Execute a piece of code within a new [[OptimisticTransaction]]. Reads/write sets will be - * recorded for this table, and all other tables will be read at a snapshot that is pinned on the - * first access. + * Execute a piece of code within a new [[OptimisticTransaction]]. Reads/write sets will + * be recorded for this table, and all other tables will be read + * at a snapshot that is pinned on the first access. * - * @note - * This uses thread-local variable to make the active transaction visible. So do not use - * multi-threaded code in the provided thunk. + * @note This uses thread-local variable to make the active transaction visible. So do not use + * multi-threaded code in the provided thunk. */ def withNewTransaction[T](thunk: OptimisticTransaction => T): T = { try { @@ -266,16 +252,16 @@ class DeltaLog private ( } } + /** * Upgrade the table's protocol version, by default to the maximum recognized reader and writer * versions in this DBR release. */ - def upgradeProtocol(snapshot: Snapshot, newVersion: Protocol): Unit = { + def upgradeProtocol( + snapshot: Snapshot, + newVersion: Protocol): Unit = { val currentVersion = snapshot.protocol - if ( - newVersion.minReaderVersion == currentVersion.minReaderVersion && - newVersion.minWriterVersion == currentVersion.minWriterVersion - ) { + if (newVersion == currentVersion) { logConsole(s"Table $dataPath is already at protocol version $newVersion.") return } @@ -292,7 +278,7 @@ class DeltaLog private ( } // Test-only!! 
- private[delta] def upgradeProtocol(newVersion: Protocol = Protocol()): Unit = { + private[delta] def upgradeProtocol(newVersion: Protocol): Unit = { upgradeProtocol(unsafeVolatileSnapshot, newVersion) } @@ -304,41 +290,39 @@ class DeltaLog private ( startVersion: Long, failOnDataLoss: Boolean = false): Iterator[(Long, Seq[Action])] = { val hadoopConf = newDeltaHadoopConf() - val deltas = store.listFrom(deltaFile(logPath, startVersion), hadoopConf).filter(isDeltaFile) + val deltas = store.listFrom(listingPrefix(logPath, startVersion), hadoopConf) + .filter(isDeltaFile) // Subtract 1 to ensure that we have the same check for the inclusive startVersion var lastSeenVersion = startVersion - 1 - deltas.map { - status => - val p = status.getPath - val version = deltaVersion(p) - if (failOnDataLoss && version > lastSeenVersion + 1) { - throw DeltaErrors.failOnDataLossException(lastSeenVersion + 1, version) - } - lastSeenVersion = version - (version, store.read(status, hadoopConf).map(Action.fromJson)) + deltas.map { status => + val p = status.getPath + val version = deltaVersion(p) + if (failOnDataLoss && version > lastSeenVersion + 1) { + throw DeltaErrors.failOnDataLossException(lastSeenVersion + 1, version) + } + lastSeenVersion = version + (version, store.read(status, hadoopConf).map(Action.fromJson)) } } /** - * Get access to all actions starting from "startVersion" (inclusive) via [[FileStatus]]. If - * `startVersion` doesn't exist, return an empty Iterator. + * Get access to all actions starting from "startVersion" (inclusive) via [[FileStatus]]. + * If `startVersion` doesn't exist, return an empty Iterator. */ def getChangeLogFiles( startVersion: Long, failOnDataLoss: Boolean = false): Iterator[(Long, FileStatus)] = { - val deltas = store - .listFrom(deltaFile(logPath, startVersion), newDeltaHadoopConf()) + val deltas = store.listFrom(listingPrefix(logPath, startVersion), newDeltaHadoopConf()) .filter(isDeltaFile) // Subtract 1 to ensure that we have the same check for the inclusive startVersion var lastSeenVersion = startVersion - 1 - deltas.map { - status => - val version = deltaVersion(status) - if (failOnDataLoss && version > lastSeenVersion + 1) { - throw DeltaErrors.failOnDataLossException(lastSeenVersion + 1, version) - } - lastSeenVersion = version - (version, status) + deltas.map { status => + val version = deltaVersion(status) + if (failOnDataLoss && version > lastSeenVersion + 1) { + throw DeltaErrors.failOnDataLossException(lastSeenVersion + 1, version) + } + lastSeenVersion = version + (version, status) } } @@ -346,40 +330,108 @@ class DeltaLog private ( | Protocol validation | * --------------------- */ + /** + * Asserts the highest protocol supported by this client is not less than what required by the + * table for performing read or write operations. This ensures the client to support a + * greater-or-equal protocol versions and recognizes/supports all features enabled by the table. + * + * The operation type to be checked is passed as a string in `readOrWrite`. Valid values are + * `read` and `write`. + */ + private def protocolCheck(tableProtocol: Protocol, readOrWrite: String): Unit = { + val clientSupportedProtocol = Action.supportedProtocolVersion() + // Depending on the operation, pull related protocol versions out of Protocol objects. + // `getEnabledFeatures` is a pointer to pull reader/writer features out of a Protocol. 
+ val (clientSupportedVersion, tableRequiredVersion, getEnabledFeatures) = readOrWrite match { + case "read" => ( + clientSupportedProtocol.minReaderVersion, + tableProtocol.minReaderVersion, + (f: Protocol) => f.readerFeatureNames) + case "write" => ( + clientSupportedProtocol.minWriterVersion, + tableProtocol.minWriterVersion, + (f: Protocol) => f.writerFeatureNames) + case _ => + throw new IllegalArgumentException("Table operation must be either `read` or `write`.") + } + + // Check is complete when both the protocol version and all referenced features are supported. + val clientSupportedFeatureNames = getEnabledFeatures(clientSupportedProtocol) + val tableEnabledFeatureNames = getEnabledFeatures(tableProtocol) + if (tableEnabledFeatureNames.subsetOf(clientSupportedFeatureNames) && + clientSupportedVersion >= tableRequiredVersion) { + return + } + + // Otherwise, either the protocol version, or few features referenced by the table, is + // unsupported. + val clientUnsupportedFeatureNames = + tableEnabledFeatureNames.diff(clientSupportedFeatureNames) + // Prepare event log constants and the appropriate error message handler. + val (opType, versionKey, unsupportedFeaturesException) = readOrWrite match { + case "read" => ( + "delta.protocol.failure.read", + "minReaderVersion", + DeltaErrors.unsupportedReaderTableFeaturesInTableException _) + case "write" => ( + "delta.protocol.failure.write", + "minWriterVersion", + DeltaErrors.unsupportedWriterTableFeaturesInTableException _) + } + recordDeltaEvent( + this, + opType, + data = Map( + "clientVersion" -> clientSupportedVersion, + versionKey -> tableRequiredVersion, + "clientFeatures" -> clientSupportedFeatureNames.mkString(","), + "clientUnsupportedFeatures" -> clientUnsupportedFeatureNames.mkString(","))) + if (clientSupportedVersion < tableRequiredVersion) { + throw new InvalidProtocolVersionException(tableRequiredVersion, clientSupportedVersion) + } else { + throw unsupportedFeaturesException(clientUnsupportedFeatureNames) + } + } + + /** + * Asserts that the table's protocol enabled all features that are active in the metadata. + * + * A mismatch shouldn't happen when the table has gone through a proper write process because we + * require all active features during writes. However, other clients may void this guarantee. + */ + def assertTableFeaturesMatchMetadata( + targetProtocol: Protocol, + targetMetadata: Metadata): Unit = { + if (!targetProtocol.supportsReaderFeatures && !targetProtocol.supportsWriterFeatures) return + + val protocolEnabledFeatures = targetProtocol.writerFeatureNames + .flatMap(TableFeature.featureNameToFeature) + val activeFeatures: Set[TableFeature] = + TableFeature.allSupportedFeaturesMap.values.collect { + case f: TableFeature with FeatureAutomaticallyEnabledByMetadata + if f.metadataRequiresFeatureToBeEnabled(targetMetadata, spark) => + f + }.toSet + val activeButNotEnabled = activeFeatures.diff(protocolEnabledFeatures) + if (activeButNotEnabled.nonEmpty) { + throw DeltaErrors.tableFeatureMismatchException(activeButNotEnabled.map(_.name)) + } + } + /** * Asserts that the client is up to date with the protocol and allowed to read the table that is * using the given `protocol`. 
*/ def protocolRead(protocol: Protocol): Unit = { - val supportedReaderVersion = - Action.supportedProtocolVersion(Some(spark.sessionState.conf)).minReaderVersion - if (supportedReaderVersion < protocol.minReaderVersion) { - recordDeltaEvent( - this, - "delta.protocol.failure.read", - data = Map( - "clientVersion" -> supportedReaderVersion, - "minReaderVersion" -> protocol.minReaderVersion)) - throw new InvalidProtocolVersionException - } + protocolCheck(protocol, "read") } /** - * Asserts that the client is up to date with the protocol and allowed to write to the table that - * is using the given `protocol`. + * Asserts that the client is up to date with the protocol and allowed to write to the table + * that is using the given `protocol`. */ - def protocolWrite(protocol: Protocol, logUpgradeMessage: Boolean = true): Unit = { - val supportedWriterVersion = - Action.supportedProtocolVersion(Some(spark.sessionState.conf)).minWriterVersion - if (supportedWriterVersion < protocol.minWriterVersion) { - recordDeltaEvent( - this, - "delta.protocol.failure.write", - data = Map( - "clientVersion" -> supportedWriterVersion, - "minWriterVersion" -> protocol.minWriterVersion)) - throw new InvalidProtocolVersionException - } + def protocolWrite(protocol: Protocol): Unit = { + protocolCheck(protocol, "write") } /* ---------------------------------------- * @@ -387,9 +439,10 @@ class DeltaLog private ( * ---------------------------------------- */ /** - * Whether a Delta table exists at this directory. It is okay to use the cached volatile snapshot - * here, since the worst case is that the table has recently started existing which hasn't been - * picked up here. If so, any subsequent command that updates the table will see the right value. + * Whether a Delta table exists at this directory. + * It is okay to use the cached volatile snapshot here, since the worst case is that the table + * has recently started existing which hasn't been picked up here. If so, any subsequent command + * that updates the table will see the right value. */ def tableExists: Boolean = unsafeVolatileSnapshot.version >= 0 @@ -420,38 +473,46 @@ class DeltaLog private ( /** * Returns a [[org.apache.spark.sql.DataFrame]] containing the new files within the specified * version range. + * */ def createDataFrame( snapshot: Snapshot, addFiles: Seq[AddFile], isStreaming: Boolean = false, - actionTypeOpt: Option[String] = None): DataFrame = { + actionTypeOpt: Option[String] = None + ): DataFrame = { val actionType = actionTypeOpt.getOrElse(if (isStreaming) "streaming" else "batch") val fileIndex = new TahoeBatchFileIndex(spark, actionType, addFiles, this, dataPath, snapshot) val hadoopOptions = snapshot.metadata.format.options ++ options + val partitionSchema = snapshot.metadata.partitionSchema + val metadata = snapshot.metadata + val relation = HadoopFsRelation( fileIndex, - partitionSchema = - DeltaColumnMapping.dropColumnMappingMetadata(snapshot.metadata.partitionSchema), + partitionSchema = DeltaColumnMapping.dropColumnMappingMetadata(partitionSchema), // We pass all table columns as `dataSchema` so that Spark will preserve the partition column // locations. Otherwise, for any partition columns not in `dataSchema`, Spark would just // append them to the end of `dataSchema`. 
dataSchema = DeltaColumnMapping.dropColumnMappingMetadata( - ColumnWithDefaultExprUtils.removeDefaultExpressions(snapshot.metadata.schema)), + ColumnWithDefaultExprUtils.removeDefaultExpressions(metadata.schema)), + // --- modified start + // TODO: Don't add the bucketOption here, it will cause the OOM when the merge into update + // key is the bucket column, fix later + // --- modified end bucketSpec = None, - snapshot.deltaLog.fileFormat(snapshot.metadata), - hadoopOptions - )(spark) + fileFormat(metadata), + hadoopOptions)(spark) Dataset.ofRows(spark, LogicalRelation(relation, isStreaming = isStreaming)) } /** - * Returns a [[BaseRelation]] that contains all of the data present in the table. This relation - * will be continually updated as files are added or removed from the table. However, new - * [[BaseRelation]] must be requested in order to see changes to the schema. + * Returns a [[BaseRelation]] that contains all of the data present + * in the table. This relation will be continually updated + * as files are added or removed from the table. However, new [[BaseRelation]] + * must be requested in order to see changes to the schema. */ def createRelation( partitionFilters: Seq[Expression] = Nil, @@ -473,21 +534,23 @@ class DeltaLog private ( if (!cdcOptions.isEmpty) { recordDeltaEvent(this, "delta.cdf.read", data = cdcOptions.asCaseSensitiveMap()) return CDCReader.getCDCRelation( - spark, - this, - snapshotToUse, - partitionFilters, - spark.sessionState.conf, - cdcOptions) + spark, snapshotToUse, isTimeTravelQuery, spark.sessionState.conf, cdcOptions) } - val fileIndex = - TahoeLogFileIndex(spark, this, dataPath, snapshotToUse, partitionFilters, isTimeTravelQuery) - var bucketSpec: Option[BucketSpec] = ClickHouseTableV2.getTable(this).bucketOption - new DeltaHadoopFsRelation( + val fileIndex = TahoeLogFileIndex( + spark, this, dataPath, snapshotToUse, partitionFilters, isTimeTravelQuery) + // --- modified start + var bucketSpec: Option[BucketSpec] = + if (ClickHouseConfig.isMergeTreeFormatEngine(snapshotToUse.metadata.configuration)) { + ClickHouseTableV2.getTable(this).bucketOption + } else { + None + } + + new DeltaLog.DeltaHadoopFsRelation( fileIndex, - partitionSchema = - DeltaColumnMapping.dropColumnMappingMetadata(snapshotToUse.metadata.partitionSchema), + partitionSchema = DeltaColumnMapping.dropColumnMappingMetadata( + snapshotToUse.metadata.partitionSchema), // We pass all table columns as `dataSchema` so that Spark will preserve the partition column // locations. Otherwise, for any partition columns not in `dataSchema`, Spark would just // append them to the end of `dataSchema` @@ -504,18 +567,21 @@ class DeltaLog private ( spark, this ) + // --- modified end } /** - * Verify the required Spark conf for delta Throw - * `DeltaErrors.configureSparkSessionWithExtensionAndCatalog` exception if - * `spark.sql.catalog.spark_catalog` config is missing. We do not check for `spark.sql.extensions` - * because DeltaSparkSessionExtension can alternatively be activated using the `.withExtension()` - * API. This check can be disabled by setting DELTA_CHECK_REQUIRED_SPARK_CONF to false. + * Verify the required Spark conf for delta + * Throw `DeltaErrors.configureSparkSessionWithExtensionAndCatalog` exception if + * `spark.sql.catalog.spark_catalog` config is missing. We do not check for + * `spark.sql.extensions` because DeltaSparkSessionExtension can alternatively + * be activated using the `.withExtension()` API. 
This check can be disabled + * by setting DELTA_CHECK_REQUIRED_SPARK_CONF to false. */ protected def checkRequiredConfigurations(): Unit = { if (spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_REQUIRED_SPARK_CONFS_CHECK)) { - if (spark.conf.getOption(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION.key).isEmpty) { + if (spark.conf.getOption( + SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION.key).isEmpty) { throw DeltaErrors.configureSparkSessionWithExtensionAndCatalog(None) } } @@ -524,9 +590,9 @@ class DeltaLog private ( /** * Returns a proper path canonicalization function for the current Delta log. * - * If `runsOnExecutors` is true, the returned method will use a broadcast Hadoop Configuration so - * that the method is suitable for execution on executors. Otherwise, the returned method will use - * a local Hadoop Configuration and the method can only be executed on the driver. + * If `runsOnExecutors` is true, the returned method will use a broadcast Hadoop Configuration + * so that the method is suitable for execution on executors. Otherwise, the returned method + * will use a local Hadoop Configuration and the method can only be executed on the driver. */ private[delta] def getCanonicalPathFunction(runsOnExecutors: Boolean): String => String = { val hadoopConf = newDeltaHadoopConf() @@ -535,7 +601,9 @@ class DeltaLog private ( val broadcastHadoopConf = spark.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) () => broadcastHadoopConf.value.value - } else { () => hadoopConf } + } else { + () => hadoopConf + } new DeltaLog.CanonicalPathFunction(getHadoopConf) } @@ -544,24 +612,33 @@ class DeltaLog private ( * Returns a proper path canonicalization UDF for the current Delta log. * * If `runsOnExecutors` is true, the returned UDF will use a broadcast Hadoop Configuration. - * Otherwise, the returned UDF will use a local Hadoop Configuration and the UDF can only be - * executed on the driver. + * Otherwise, the returned UDF will use a local Hadoop Configuration and the UDF can + * only be executed on the driver. */ private[delta] def getCanonicalPathUdf(runsOnExecutors: Boolean = true): UserDefinedFunction = { DeltaUDF.stringFromString(getCanonicalPathFunction(runsOnExecutors)) } - override def fileFormat(metadata: Metadata = metadata): FileFormat = - ClickHouseTableV2.getTable(this).getFileFormat(metadata) + override def fileFormat(metadata: Metadata): FileFormat = { + // --- modified start + if (ClickHouseConfig.isMergeTreeFormatEngine(metadata.configuration)) { + ClickHouseTableV2.getTable(this).getFileFormat(metadata) + } else { + super.fileFormat(metadata) + } + // --- modified end + } } object DeltaLog extends DeltaLogging { + + // --- modified start @SuppressWarnings(Array("io.github.zhztheplayer.scalawarts.InheritFromCaseClass")) private class DeltaHadoopFsRelation( location: FileIndex, partitionSchema: StructType, - // The top-level columns in `dataSchema` should match the actual physical file schema, otherwise - // the ORC data source may not work with the by-ordinal mode. + // The top-level columns in `dataSchema` should match the actual physical file schema, + // otherwise the ORC data source may not work with the by-ordinal mode. 
dataSchema: StructType, bucketSpec: Option[BucketSpec], fileFormat: FileFormat, @@ -573,7 +650,7 @@ object DeltaLog extends DeltaLogging { bucketSpec, fileFormat, options)(sparkSession) - with InsertableRelation { + with InsertableRelation { def insert(data: DataFrame, overwrite: Boolean): Unit = { val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append WriteIntoDelta( @@ -586,6 +663,7 @@ object DeltaLog extends DeltaLogging { ).run(sparkSession) } } + // --- modified end /** * The key type of `DeltaLog` cache. It's a pair of the canonicalized table path and the file @@ -602,30 +680,27 @@ object DeltaLog extends DeltaLogging { private[delta] def logPathFor(dataPath: File): Path = logPathFor(dataPath.getAbsolutePath) /** - * We create only a single [[DeltaLog]] for any given `DeltaLogCacheKey` to avoid wasted work in - * reconstructing the log. + * We create only a single [[DeltaLog]] for any given `DeltaLogCacheKey` to avoid wasted work + * in reconstructing the log. */ private val deltaLogCache = { - val builder = CacheBuilder - .newBuilder() + val builder = CacheBuilder.newBuilder() .expireAfterAccess(60, TimeUnit.MINUTES) - .removalListener( - (removalNotification: RemovalNotification[DeltaLogCacheKey, DeltaLog]) => { + .removalListener((removalNotification: RemovalNotification[DeltaLogCacheKey, DeltaLog]) => { val log = removalNotification.getValue // TODO: We should use ref-counting to uncache snapshots instead of a manual timed op - try log.unsafeVolatileSnapshot.uncache() - catch { + try log.unsafeVolatileSnapshot.uncache() catch { case _: java.lang.NullPointerException => // Various layers will throw null pointer if the RDD is already gone. } - }) - sys.props - .get("delta.log.cacheSize") + }) + sys.props.get("delta.log.cacheSize") .flatMap(v => Try(v.toLong).toOption) .foreach(builder.maximumSize) builder.build[DeltaLogCacheKey, DeltaLog]() } + // Don't tolerate malformed JSON when parsing Delta log actions (default is PERMISSIVE) val jsonCommitParseOption = Map("mode" -> FailFastMode.name) @@ -710,28 +785,38 @@ object DeltaLog extends DeltaLogging { private def apply(spark: SparkSession, rawPath: Path, clock: Clock = new SystemClock): DeltaLog = apply(spark, rawPath, Map.empty, clock) + /** Helper for getting a log, as well as the latest snapshot, of the table */ def forTableWithSnapshot(spark: SparkSession, dataPath: String): (DeltaLog, Snapshot) = - withFreshSnapshot(forTable(spark, dataPath, _)) + withFreshSnapshot { forTable(spark, dataPath, _) } /** Helper for getting a log, as well as the latest snapshot, of the table */ def forTableWithSnapshot(spark: SparkSession, dataPath: Path): (DeltaLog, Snapshot) = - withFreshSnapshot(forTable(spark, dataPath, _)) + withFreshSnapshot { forTable(spark, dataPath, _) } /** Helper for getting a log, as well as the latest snapshot, of the table */ - def forTableWithSnapshot(spark: SparkSession, tableName: TableIdentifier): (DeltaLog, Snapshot) = - withFreshSnapshot(forTable(spark, tableName, _)) + def forTableWithSnapshot( + spark: SparkSession, + tableName: TableIdentifier): (DeltaLog, Snapshot) = + withFreshSnapshot { forTable(spark, tableName, _) } /** Helper for getting a log, as well as the latest snapshot, of the table */ def forTableWithSnapshot( spark: SparkSession, tableName: DeltaTableIdentifier): (DeltaLog, Snapshot) = - withFreshSnapshot(forTable(spark, tableName, _)) + withFreshSnapshot { forTable(spark, tableName, _) } + + /** Helper for getting a log, as well as the latest snapshot, of the table */ + def 
forTableWithSnapshot( + spark: SparkSession, + dataPath: Path, + options: Map[String, String]): (DeltaLog, Snapshot) = + withFreshSnapshot { apply(spark, logPathFor(dataPath), options, _) } /** - * Helper function to be used with the forTableWithSnapshot calls. Thunk is a partially applied - * DeltaLog.forTable call, which we can then wrap around with a snapshot update. We use the system - * clock to avoid back-to-back updates. + * Helper function to be used with the forTableWithSnapshot calls. Thunk is a + * partially applied DeltaLog.forTable call, which we can then wrap around with a + * snapshot update. We use the system clock to avoid back-to-back updates. */ private[delta] def withFreshSnapshot(thunk: Clock => DeltaLog): (DeltaLog, Snapshot) = { val clock = new SystemClock @@ -748,14 +833,12 @@ object DeltaLog extends DeltaLogging { clock: Clock ): DeltaLog = { val fileSystemOptions: Map[String, String] = - if ( - spark.sessionState.conf.getConf( - DeltaSQLConf.LOAD_FILE_SYSTEM_CONFIGS_FROM_DATAFRAME_OPTIONS) - ) { + if (spark.sessionState.conf.getConf( + DeltaSQLConf.LOAD_FILE_SYSTEM_CONFIGS_FROM_DATAFRAME_OPTIONS)) { // We pick up only file system options so that we don't pass any parquet or json options to // the code that reads Delta transaction logs. - options.filterKeys { - k => DeltaTableUtils.validDeltaTableHadoopPrefixes.exists(k.startsWith) + options.filterKeys { k => + DeltaTableUtils.validDeltaTableHadoopPrefixes.exists(k.startsWith) }.toMap } else { Map.empty @@ -769,14 +852,15 @@ object DeltaLog extends DeltaLogging { null, "delta.log.create", Map(TAG_TAHOE_PATH -> path.getParent.toString)) { - AnalysisHelper.allowInvokingTransformsInAnalyzer { - new DeltaLog( - logPath = path, - dataPath = path.getParent, - options = fileSystemOptions, - clock = clock - ) - } + AnalysisHelper.allowInvokingTransformsInAnalyzer { + new DeltaLog( + logPath = path, + dataPath = path.getParent, + options = fileSystemOptions, + allOptions = options, + clock = clock + ) + } } def getDeltaLogFromCache(): DeltaLog = { // The following cases will still create a new ActionLog even if there is a cached @@ -785,7 +869,10 @@ object DeltaLog extends DeltaLogging { // - Different `authority` (e.g., different user tokens in the path) // - Different mount point. try { - deltaLogCache.get(path -> fileSystemOptions, () => createDeltaLog()) + deltaLogCache.get(path -> fileSystemOptions, () => { + createDeltaLog() + } + ) } catch { case e: com.google.common.util.concurrent.UncheckedExecutionException => throw e.getCause @@ -814,10 +901,8 @@ object DeltaLog extends DeltaLogging { // scalastyle:on deltahadoopconfiguration val path = fs.makeQualified(rawPath) - if ( - spark.sessionState.conf.getConf( - DeltaSQLConf.LOAD_FILE_SYSTEM_CONFIGS_FROM_DATAFRAME_OPTIONS) - ) { + if (spark.sessionState.conf.getConf( + DeltaSQLConf.LOAD_FILE_SYSTEM_CONFIGS_FROM_DATAFRAME_OPTIONS)) { // We rely on the fact that accessing the key set doesn't modify the entry access time. See // `CacheBuilder.expireAfterAccess`. val keysToBeRemoved = mutable.ArrayBuffer[DeltaLogCacheKey]() @@ -848,38 +933,42 @@ object DeltaLog extends DeltaLogging { /** * Filters the given [[Dataset]] by the given `partitionFilters`, returning those that match. 
- * @param files - * The active files in the DeltaLog state, which contains the partition value information - * @param partitionFilters - * Filters on the partition columns - * @param partitionColumnPrefixes - * The path to the `partitionValues` column, if it's nested + * @param files The active files in the DeltaLog state, which contains the partition value + * information + * @param partitionFilters Filters on the partition columns + * @param partitionColumnPrefixes The path to the `partitionValues` column, if it's nested + * @param shouldRewritePartitionFilters Whether to rewrite `partitionFilters` to be over the + * [[AddFile]] schema */ def filterFileList( partitionSchema: StructType, files: DataFrame, partitionFilters: Seq[Expression], - partitionColumnPrefixes: Seq[String] = Nil): DataFrame = { - val rewrittenFilters = rewritePartitionFilters( - partitionSchema, - files.sparkSession.sessionState.conf.resolver, - partitionFilters, - partitionColumnPrefixes) + partitionColumnPrefixes: Seq[String] = Nil, + shouldRewritePartitionFilters: Boolean = true): DataFrame = { + + val rewrittenFilters = if (shouldRewritePartitionFilters) { + rewritePartitionFilters( + partitionSchema, + files.sparkSession.sessionState.conf.resolver, + partitionFilters, + partitionColumnPrefixes) + } else { + partitionFilters + } val expr = rewrittenFilters.reduceLeftOption(And).getOrElse(Literal.TrueLiteral) val columnFilter = new Column(expr) files.filter(columnFilter) } /** - * Rewrite the given `partitionFilters` to be used for filtering partition values. We need to - * explicitly resolve the partitioning columns here because the partition columns are stored as - * keys of a Map type instead of attributes in the AddFile schema (below) and thus cannot be - * resolved automatically. + * Rewrite the given `partitionFilters` to be used for filtering partition values. + * We need to explicitly resolve the partitioning columns here because the partition columns + * are stored as keys of a Map type instead of attributes in the AddFile schema (below) and thus + * cannot be resolved automatically. * - * @param partitionFilters - * Filters on the partition columns - * @param partitionColumnPrefixes - * The path to the `partitionValues` column, if it's nested + * @param partitionFilters Filters on the partition columns + * @param partitionColumnPrefixes The path to the `partitionValues` column, if it's nested */ def rewritePartitionFilters( partitionSchema: StructType, @@ -891,7 +980,7 @@ object DeltaLog extends DeltaLogging { // If we have a special column name, e.g. `a.a`, then an UnresolvedAttribute returns // the column name as '`a.a`' instead of 'a.a', therefore we need to strip the backticks. val unquoted = a.name.stripPrefix("`").stripSuffix("`") - val partitionCol = partitionSchema.find(field => resolver(field.name, unquoted)) + val partitionCol = partitionSchema.find { field => resolver(field.name, unquoted) } partitionCol match { case Some(f: StructField) => val name = DeltaColumnMapping.getPhysicalName(f) @@ -907,16 +996,32 @@ object DeltaLog extends DeltaLogging { }) } + + /** + * Checks whether this table only accepts appends. If so it will throw an error in operations that + * can remove data such as DELETE/UPDATE/MERGE. 
+ */ + def assertRemovable(snapshot: Snapshot): Unit = { + val metadata = snapshot.metadata + if (DeltaConfigs.IS_APPEND_ONLY.fromMetaData(metadata)) { + throw DeltaErrors.modifyAppendOnlyTableException(metadata.name) + } + } + + /** How long to keep around SetTransaction actions before physically deleting them. */ def minSetTransactionRetentionInterval(metadata: Metadata): Option[Long] = { DeltaConfigs.TRANSACTION_ID_RETENTION_DURATION .fromMetaData(metadata) .map(DeltaConfigs.getMilliSeconds) } + /** How long to keep around logically deleted files before physically deleting them. */ + def tombstoneRetentionMillis(metadata: Metadata): Long = { + DeltaConfigs.getMilliSeconds(DeltaConfigs.TOMBSTONE_RETENTION.fromMetaData(metadata)) + } /** Get a function that canonicalizes a given `path`. */ private[delta] class CanonicalPathFunction(getHadoopConf: () => Configuration) - extends Function[String, String] - with Serializable { + extends Function[String, String] with Serializable { // Mark it `@transient lazy val` so that de-serialization happens only once on every executor. @transient private lazy val fs = { diff --git a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/Snapshot.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/Snapshot.scala similarity index 59% rename from backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/Snapshot.scala rename to backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/Snapshot.scala index 1c62d133174dc..b2b5ba42bb305 100644 --- a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/Snapshot.scala +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/Snapshot.scala @@ -14,33 +14,38 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.spark.sql.delta -import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.expressions.Expression +// scalastyle:off import.ordering.noEmptyLine +import scala.collection.mutable + import org.apache.spark.sql.delta.actions._ import org.apache.spark.sql.delta.actions.Action.logSchema import org.apache.spark.sql.delta.metering.DeltaLogging import org.apache.spark.sql.delta.schema.SchemaUtils import org.apache.spark.sql.delta.sources.DeltaSQLConf -import org.apache.spark.sql.delta.stats.{DataSkippingReader, DeltaScan, FileSizeHistogram, StatisticsCollection} +import org.apache.spark.sql.delta.stats.DataSkippingReader +import org.apache.spark.sql.delta.stats.DeltaScan +import org.apache.spark.sql.delta.stats.FileSizeHistogram +import org.apache.spark.sql.delta.stats.StatisticsCollection import org.apache.spark.sql.delta.util.StateCache +import org.apache.hadoop.fs.{FileStatus, Path} + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils -import org.apache.hadoop.fs.{FileStatus, Path} - -// scalastyle:off import.ordering.noEmptyLine -import scala.collection.mutable - /** * Gluten overwrite Delta: * - * This file is copied from Delta 2.2.0. It is modified to overcome the following issues: - * 1. filesForScan() should return DeltaScan of AddMergeTreeParts instead of AddFile + * This file is copied from Delta 2.3.0. It is modified to overcome the following issues: + * 1. 
filesForScan() will cache the DeltaScan by the FilterExprsAsKey + * 2. filesForScan() should return DeltaScan of AddMergeTreeParts instead of AddFile */ - /** * A description of a Delta [[Snapshot]], including basic information such its [[DeltaLog]] * metadata, protocol, and version. @@ -55,28 +60,27 @@ trait SnapshotDescriptor { } /** - * An immutable snapshot of the state of the log at some delta version. Internally this class - * manages the replay of actions stored in checkpoint or delta files. + * An immutable snapshot of the state of the log at some delta version. Internally + * this class manages the replay of actions stored in checkpoint or delta files. * - * After resolving any new actions, it caches the result and collects the following basic - * information to the driver: - * - Protocol Version - * - Metadata - * - Transaction state + * After resolving any new actions, it caches the result and collects the + * following basic information to the driver: + * - Protocol Version + * - Metadata + * - Transaction state + * + * @param timestamp The timestamp of the latest commit in milliseconds. Can also be set to -1 if the + * timestamp of the commit is unknown or the table has not been initialized, i.e. + * `version = -1`. * - * @param timestamp - * The timestamp of the latest commit in milliseconds. Can also be set to -1 if the timestamp of - * the commit is unknown or the table has not been initialized, i.e. `version = -1`. */ class Snapshot( val path: Path, override val version: Long, val logSegment: LogSegment, - val minFileRetentionTimestamp: Long, override val deltaLog: DeltaLog, val timestamp: Long, val checksumOpt: Option[VersionChecksum], - val minSetTransactionRetentionTimestamp: Option[Long] = None, checkpointMetadataOpt: Option[CheckpointMetaData] = None) extends SnapshotDescriptor with StateCache @@ -84,25 +88,25 @@ class Snapshot( with DataSkippingReader with DeltaLogging { - import org.apache.spark.sql.delta.implicits._ - - // For implicits which re-use Encoder: import Snapshot._ + // For implicits which re-use Encoder: + import org.apache.spark.sql.delta.implicits._ protected def spark = SparkSession.active + /** Snapshot to scan by the DeltaScanGenerator for metadata query optimizations */ override val snapshotToScan: Snapshot = this protected def getNumPartitions: Int = { - spark.sessionState.conf - .getConf(DeltaSQLConf.DELTA_SNAPSHOT_PARTITIONS) + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_SNAPSHOT_PARTITIONS) .getOrElse(Snapshot.defaultNumSnapshotPartitions) } /** Performs validations during initialization */ protected def init(): Unit = { deltaLog.protocolRead(protocol) + deltaLog.assertTableFeaturesMatchMetadata(protocol, metadata) SchemaUtils.recordUndefinedTypes(deltaLog, metadata.schema) } @@ -127,49 +131,61 @@ class Snapshot( val ADD_PATH_CANONICAL_COL_NAME = "add_path_canonical" val REMOVE_PATH_CANONICAL_COL_NAME = "remove_path_canonical" loadActions - .withColumn( - ADD_PATH_CANONICAL_COL_NAME, - when(col("add.path").isNotNull, canonicalPath(col("add.path")))) - .withColumn( - REMOVE_PATH_CANONICAL_COL_NAME, - when(col("remove.path").isNotNull, canonicalPath(col("remove.path")))) + .withColumn(ADD_PATH_CANONICAL_COL_NAME, when( + col("add.path").isNotNull, canonicalPath(col("add.path")))) + .withColumn(REMOVE_PATH_CANONICAL_COL_NAME, when( + col("remove.path").isNotNull, canonicalPath(col("remove.path")))) .repartition( getNumPartitions, coalesce(col(ADD_PATH_CANONICAL_COL_NAME), col(REMOVE_PATH_CANONICAL_COL_NAME))) 
.sortWithinPartitions(ACTION_SORT_COL_NAME) - .withColumn( - "add", - when( - col("add.path").isNotNull, - struct( - col(ADD_PATH_CANONICAL_COL_NAME).as("path"), - col("add.partitionValues"), - col("add.size"), - col("add.modificationTime"), - col("add.dataChange"), - col(ADD_STATS_TO_USE_COL_NAME).as("stats"), - col("add.tags") - ) - ) - ) - .withColumn( - "remove", - when( - col("remove.path").isNotNull, - col("remove").withField("path", col(REMOVE_PATH_CANONICAL_COL_NAME)))) + .withColumn("add", when( + col("add.path").isNotNull, + struct( + col(ADD_PATH_CANONICAL_COL_NAME).as("path"), + col("add.partitionValues"), + col("add.size"), + col("add.modificationTime"), + col("add.dataChange"), + col(ADD_STATS_TO_USE_COL_NAME).as("stats"), + col("add.tags"), + col("add.deletionVector") + ))) + .withColumn("remove", when( + col("remove.path").isNotNull, + col("remove").withField("path", col(REMOVE_PATH_CANONICAL_COL_NAME)))) .as[SingleAction] - .mapPartitions { - iter => - val state: LogReplay = - new InMemoryLogReplay( - localMinFileRetentionTimestamp, - localMinSetTransactionRetentionTimestamp) - state.append(0, iter.map(_.unwrap)) - state.checkpoint.map(_.wrap) + .mapPartitions { iter => + val state: LogReplay = + new InMemoryLogReplay( + localMinFileRetentionTimestamp, + localMinSetTransactionRetentionTimestamp) + state.append(0, iter.map(_.unwrap)) + state.checkpoint.map(_.wrap) } } } + /** + * Pulls the protocol and metadata of the table from the files that are used to compute the + * Snapshot directly--without triggering a full state reconstruction. This is important, because + * state reconstruction depends on protocol and metadata for correctness. + */ + protected def protocolAndMetadataReconstruction(): Array[(Protocol, Metadata)] = { + import implicits._ + + val schemaToUse = Action.logSchema(Set("protocol", "metaData")) + fileIndices.map(deltaLog.loadIndex(_, schemaToUse)) + .reduceOption(_.union(_)).getOrElse(emptyDF) + .withColumn(ACTION_SORT_COL_NAME, input_file_name()) + .select("protocol", "metaData", ACTION_SORT_COL_NAME) + .where("protocol.minReaderVersion is not null or metaData.id is not null") + .as[(Protocol, Metadata, String)] + .collect() + .sortBy(_._3) + .map { case (p, m, _) => p -> m } + } + def redactedPath: String = Utils.redact(spark.sessionState.conf.stringRedactionPattern, path.toUri.toString) @@ -189,7 +205,9 @@ class Snapshot( cachedState.getDF } - /** A Map of alias to aggregations which needs to be done to calculate the `computedState` */ + /** + * A Map of alias to aggregations which needs to be done to calculate the `computedState` + */ protected def aggregationsToComputeState: Map[String, Column] = { Map( // sum may return null for empty data set. 
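
(Illustrative aside, not part of the patch: both DeltaLog.deltaLogCache above and the
ClickhouseSnapshot.deltaScanCache used by the filesForScan overrides further down rely on the
same Guava "get or compute" idiom — look a value up by a composite key and build it only on a
cache miss. The standalone sketch below shows that idiom with deliberately simplified,
hypothetical key and value types; the real key (FilterExprsAsKey) and value (DeltaScan) types,
and the actual sizing policies, live in the Gluten and Delta sources.)

import java.util.concurrent.TimeUnit

import com.google.common.cache.CacheBuilder

object GetOrComputeSketch {
  // Hypothetical key: (table path, snapshot id, canonical filter string); value: file paths.
  private val cache = CacheBuilder
    .newBuilder()
    .maximumSize(100)
    .expireAfterAccess(60, TimeUnit.MINUTES)
    .build[(String, String, String), Seq[String]]()

  def filesFor(path: String, snapshotId: String, filterKey: String)(
      compute: () => Seq[String]): Seq[String] = {
    // get() runs `compute` only when the key is absent, so repeated scans of the same
    // snapshot with the same filters reuse the result computed the first time.
    cache.get((path, snapshotId, filterKey), () => compute())
  }
}
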
@@ -223,24 +241,75 @@ class Snapshot( recordDeltaEvent( deltaLog, opType = "delta.assertions.missingAction", - data = - Map("version" -> version.toString, "action" -> "Protocol", "source" -> "Snapshot")) + data = Map( + "version" -> version.toString, "action" -> "Protocol", "source" -> "Snapshot")) + throw DeltaErrors.actionNotFoundException("protocol", version) + } else if (_computedState.protocol != protocol) { + recordDeltaEvent( + deltaLog, + opType = "delta.assertions.mismatchedAction", + data = Map( + "version" -> version.toString, "action" -> "Protocol", "source" -> "Snapshot", + "computedState.protocol" -> _computedState.protocol, + "extracted.protocol" -> protocol)) throw DeltaErrors.actionNotFoundException("protocol", version) } + if (_computedState.metadata == null) { recordDeltaEvent( deltaLog, opType = "delta.assertions.missingAction", - data = - Map("version" -> version.toString, "action" -> "Metadata", "source" -> "Metadata")) + data = Map( + "version" -> version.toString, "action" -> "Metadata", "source" -> "Metadata")) + throw DeltaErrors.actionNotFoundException("metadata", version) + } else if (_computedState.metadata != metadata) { + recordDeltaEvent( + deltaLog, + opType = "delta.assertions.mismatchedAction", + data = Map( + "version" -> version.toString, "action" -> "Metadata", "source" -> "Snapshot", + "computedState.metadata" -> _computedState.metadata, + "extracted.metadata" -> metadata)) throw DeltaErrors.actionNotFoundException("metadata", version) - } else { - _computedState } + + _computedState } } } + // Used by [[protocol]] and [[metadata]] below + private lazy val (_protocol, _metadata): (Protocol, Metadata) = { + // Should be small. At most 'checkpointInterval' rows, unless new commits are coming + // in before a checkpoint can be written + var protocol: Protocol = null + var metadata: Metadata = null + protocolAndMetadataReconstruction().foreach { + case (p: Protocol, _) => protocol = p + case (_, m: Metadata) => metadata = m + } + + if (protocol == null) { + recordDeltaEvent( + deltaLog, + opType = "delta.assertions.missingAction", + data = Map( + "version" -> version.toString, "action" -> "Protocol", "source" -> "Snapshot")) + throw DeltaErrors.actionNotFoundException("protocol", version) + } + + if (metadata == null) { + recordDeltaEvent( + deltaLog, + opType = "delta.assertions.missingAction", + data = Map( + "version" -> version.toString, "action" -> "Metadata", "source" -> "Snapshot")) + throw DeltaErrors.actionNotFoundException("metadata", version) + } + + protocol -> metadata + } + def sizeInBytes: Long = computedState.sizeInBytes def numOfSetTransactions: Long = computedState.numOfSetTransactions def numOfFiles: Long = computedState.numOfFiles @@ -248,18 +317,34 @@ class Snapshot( def numOfMetadata: Long = computedState.numOfMetadata def numOfProtocol: Long = computedState.numOfProtocol def setTransactions: Seq[SetTransaction] = computedState.setTransactions - override def metadata: Metadata = computedState.metadata - override def protocol: Protocol = computedState.protocol + override def metadata: Metadata = _metadata + override def protocol: Protocol = _protocol def fileSizeHistogram: Option[FileSizeHistogram] = computedState.fileSizeHistogram - private[delta] def sizeInBytesOpt: Option[Long] = Some(sizeInBytes) - private[delta] def setTransactionsOpt: Option[Seq[SetTransaction]] = Some(setTransactions) - private[delta] def numOfFilesOpt: Option[Long] = Some(numOfFiles) + private[delta] def sizeInBytesIfKnown: Option[Long] = Some(sizeInBytes) + 
private[delta] def setTransactionsIfKnown: Option[Seq[SetTransaction]] = Some(setTransactions) + private[delta] def numOfFilesIfKnown: Option[Long] = Some(numOfFiles) + + /** + * Tombstones before the [[minFileRetentionTimestamp]] timestamp will be dropped from the + * checkpoint. + */ + private[delta] def minFileRetentionTimestamp: Long = { + deltaLog.clock.getTimeMillis() - DeltaLog.tombstoneRetentionMillis(metadata) + } + + /** + * [[SetTransaction]]s before [[minSetTransactionRetentionTimestamp]] will be considered expired + * and dropped from the snapshot. + */ + private[delta] def minSetTransactionRetentionTimestamp: Option[Long] = { + DeltaLog.minSetTransactionRetentionInterval(metadata).map(deltaLog.clock.getTimeMillis() - _) + } /** - * Computes all the information that is needed by the checksum for the current snapshot. May kick - * off state reconstruction if needed by any of the underlying fields. Note that it's safe to set - * txnId to none, since the snapshot doesn't always have a txn attached. E.g. if a snapshot is - * created by reading a checkpoint, then no txnId is present. + * Computes all the information that is needed by the checksum for the current snapshot. + * May kick off state reconstruction if needed by any of the underlying fields. + * Note that it's safe to set txnId to none, since the snapshot doesn't always have a txn + * attached. E.g. if a snapshot is created by reading a checkpoint, then no txnId is present. */ def computeChecksum: VersionChecksum = VersionChecksum( txnId = None, @@ -271,8 +356,7 @@ class Snapshot( metadata = metadata, protocol = protocol, histogramOpt = fileSizeHistogram, - allFiles = checksumOpt.flatMap(_.allFiles) - ) + allFiles = checksumOpt.flatMap(_.allFiles)) /** A map to look up transaction version by appId. */ lazy val transactions: Map[String, Long] = setTransactions.map(t => t.appId -> t.version).toMap @@ -300,17 +384,23 @@ class Snapshot( lazy val numIndexedCols: Int = DeltaConfigs.DATA_SKIPPING_NUM_INDEXED_COLS.fromMetaData(metadata) /** Return the set of properties of the table. */ - def getProperties: mutable.HashMap[String, String] = { - val base = new mutable.HashMap[String, String]() - metadata.configuration.foreach { - case (k, v) => - if (k != "path") { - base.put(k, v) - } + def getProperties: mutable.Map[String, String] = { + val base = new mutable.LinkedHashMap[String, String]() + metadata.configuration.foreach { case (k, v) => + if (k != "path") { + base.put(k, v) + } } base.put(Protocol.MIN_READER_VERSION_PROP, protocol.minReaderVersion.toString) base.put(Protocol.MIN_WRITER_VERSION_PROP, protocol.minWriterVersion.toString) - base + if (protocol.supportsReaderFeatures || protocol.supportsWriterFeatures) { + val features = protocol.readerAndWriterFeatureNames.map(name => + s"${TableFeatureProtocolUtils.FEATURE_PROP_PREFIX}$name" -> + TableFeatureProtocolUtils.FEATURE_PROP_SUPPORTED) + base ++ features.toSeq.sorted + } else { + base + } } // Given the list of files from `LogSegment`, create respective file indices to help create @@ -345,10 +435,8 @@ class Snapshot( * config settings for delta.checkpoint.writeStatsAsJson and delta.checkpoint.writeStatsAsStruct). 
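The reworked getProperties above switches to an insertion-ordered map and, when the protocol supports table features, appends one "supported" property per feature after the reader/writer versions. A hedged sketch of that shape; the property keys and the version thresholds shown are assumptions for illustration, not taken from this patch:

import scala.collection.mutable

object TablePropertiesSketch {
  def properties(
      configuration: Map[String, String],
      minReaderVersion: Int,
      minWriterVersion: Int,
      featureNames: Set[String]): mutable.Map[String, String] = {
    val base = new mutable.LinkedHashMap[String, String]()
    configuration.foreach { case (k, v) => if (k != "path") base.put(k, v) }
    base.put("delta.minReaderVersion", minReaderVersion.toString)
    base.put("delta.minWriterVersion", minWriterVersion.toString)
    // Assumed here: table features kick in at reader version 3 / writer version 7.
    if (minReaderVersion >= 3 || minWriterVersion >= 7) {
      val features = featureNames.map(n => s"delta.feature.$n" -> "supported")
      base ++ features.toSeq.sorted
    } else {
      base
    }
  }

  def main(args: Array[String]): Unit =
    println(properties(Map("appendOnly" -> "true"), 3, 7, Set("deletionVectors")))
}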
*/ protected def loadActions: DataFrame = { - val dfs = fileIndices.map(index => Dataset.ofRows(spark, deltaLog.indexToRelation(index))) - dfs - .reduceOption(_.union(_)) - .getOrElse(emptyDF) + fileIndices.map(deltaLog.loadIndex(_)) + .reduceOption(_.union(_)).getOrElse(emptyDF) .withColumn(ACTION_SORT_COL_NAME, input_file_name()) .withColumn(ADD_STATS_TO_USE_COL_NAME, col("add.stats")) } @@ -356,6 +444,7 @@ class Snapshot( protected def emptyDF: DataFrame = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], logSchema) + override def logInfo(msg: => String): Unit = { super.logInfo(s"[tableId=${deltaLog.tableId}] " + msg) } @@ -380,21 +469,22 @@ class Snapshot( s"${getClass.getSimpleName}(path=$path, version=$version, metadata=$metadata, " + s"logSegment=$logSegment, checksumOpt=$checksumOpt)" - override def filesForScan(filters: Seq[Expression], keepNumRecords: Boolean): DeltaScan = { + // --- modified start + override def filesForScan(limit: Long): DeltaScan = { val deltaScan = ClickhouseSnapshot.deltaScanCache.get( - FilterExprsAsKey(path, ClickhouseSnapshot.genSnapshotId(this), filters, None), + FilterExprsAsKey(path, ClickhouseSnapshot.genSnapshotId(this), Seq.empty, Some(limit)), () => { - super.filesForScan(filters, keepNumRecords) + super.filesForScan(limit) }) replaceWithAddMergeTreeParts(deltaScan) } - override def filesForScan(limit: Long): DeltaScan = { + override def filesForScan(filters: Seq[Expression], keepNumRecords: Boolean): DeltaScan = { val deltaScan = ClickhouseSnapshot.deltaScanCache.get( - FilterExprsAsKey(path, ClickhouseSnapshot.genSnapshotId(this), Seq.empty, Some(limit)), + FilterExprsAsKey(path, ClickhouseSnapshot.genSnapshotId(this), filters, None), () => { - super.filesForScan(limit) + super.filesForScan(filters, keepNumRecords) }) replaceWithAddMergeTreeParts(deltaScan) @@ -411,30 +501,35 @@ class Snapshot( } private def replaceWithAddMergeTreeParts(deltaScan: DeltaScan) = { - DeltaScan.apply( - deltaScan.version, - deltaScan.files - .map( - addFile => { - val addFileAsKey = AddFileAsKey(addFile) - - val ret = ClickhouseSnapshot.addFileToAddMTPCache.get(addFileAsKey) - // this is for later use - ClickhouseSnapshot.pathToAddMTPCache.put(ret.fullPartPath(), ret) - ret - }), - deltaScan.total, - deltaScan.partition, - deltaScan.scanned - )( - deltaScan.scannedSnapshot, - deltaScan.partitionFilters, - deltaScan.dataFilters, - deltaScan.unusedFilters, - deltaScan.scanDurationMs, - deltaScan.dataSkippingType - ) + if (ClickHouseConfig.isMergeTreeFormatEngine(metadata.configuration)) { + DeltaScan.apply( + deltaScan.version, + deltaScan.files + .map( + addFile => { + val addFileAsKey = AddFileAsKey(addFile) + + val ret = ClickhouseSnapshot.addFileToAddMTPCache.get(addFileAsKey) + // this is for later use + ClickhouseSnapshot.pathToAddMTPCache.put(ret.fullPartPath(), ret) + ret + }), + deltaScan.total, + deltaScan.partition, + deltaScan.scanned + )( + deltaScan.scannedSnapshot, + deltaScan.partitionFilters, + deltaScan.dataFilters, + deltaScan.unusedFilters, + deltaScan.scanDurationMs, + deltaScan.dataSkippingType + ) + } else { + deltaScan + } } + // --- modified end logInfo(s"Created snapshot $this") init() @@ -450,64 +545,51 @@ object Snapshot extends DeltaLogging { /** Verifies that a set of delta or checkpoint files to be read actually belongs to this table. 
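The two filesForScan overrides further down are swapped so that each variant caches its DeltaScan under the matching key (filters for one, the limit for the other). The memoization pattern they rely on is, in spirit, a concurrent map keyed by snapshot id plus scan parameters; the sketch below models only that pattern with hypothetical names, not the real ClickhouseSnapshot cache.

import scala.collection.concurrent.TrieMap

object ScanCacheSketch {
  final case class ScanKey(snapshotId: String, filters: Seq[String], limit: Option[Long])
  final case class Scan(files: Seq[String])

  private val cache = TrieMap.empty[ScanKey, Scan]

  // The expensive listing runs at most once per key; later calls hit the cache.
  def filesForScan(key: ScanKey)(compute: => Scan): Scan =
    cache.getOrElseUpdate(key, compute)

  def main(args: Array[String]): Unit = {
    var computed = 0
    val key = ScanKey("snapshot-1", Seq("p = 1"), None)
    filesForScan(key) { computed += 1; Scan(Seq("part-0")) }
    filesForScan(key) { computed += 1; Scan(Seq("part-0")) }
    println(computed) // 1
  }
}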
*/ private def assertLogFilesBelongToTable(logBasePath: Path, files: Seq[FileStatus]): Unit = { - files.map(_.getPath).foreach { - filePath => - if (new Path(filePath.toUri).getParent != new Path(logBasePath.toUri)) { - // scalastyle:off throwerror - throw new AssertionError( - s"File ($filePath) doesn't belong in the " + - s"transaction log at $logBasePath. Please contact Databricks Support.") - // scalastyle:on throwerror - } + files.map(_.getPath).foreach { filePath => + if (new Path(filePath.toUri).getParent != new Path(logBasePath.toUri)) { + // scalastyle:off throwerror + throw new AssertionError(s"File ($filePath) doesn't belong in the " + + s"transaction log at $logBasePath. Please contact Databricks Support.") + // scalastyle:on throwerror + } } } /** * Metrics and metadata computed around the Delta table. - * @param sizeInBytes - * The total size of the table (of active files, not including tombstones). - * @param numOfSetTransactions - * Number of streams writing to this table. - * @param numOfFiles - * The number of files in this table. - * @param numOfRemoves - * The number of tombstones in the state. - * @param numOfMetadata - * The number of metadata actions in the state. Should be 1. - * @param numOfProtocol - * The number of protocol actions in the state. Should be 1. - * @param setTransactions - * The streaming queries writing to this table. - * @param metadata - * The metadata of the table. - * @param protocol - * The protocol version of the Delta table. - * @param fileSizeHistogram - * A Histogram class tracking the file counts and total bytes in different size ranges. + * @param sizeInBytes The total size of the table (of active files, not including tombstones). + * @param numOfSetTransactions Number of streams writing to this table. + * @param numOfFiles The number of files in this table. + * @param numOfRemoves The number of tombstones in the state. + * @param numOfMetadata The number of metadata actions in the state. Should be 1. + * @param numOfProtocol The number of protocol actions in the state. Should be 1. + * @param setTransactions The streaming queries writing to this table. + * @param metadata The metadata of the table. + * @param protocol The protocol version of the Delta table. + * @param fileSizeHistogram A Histogram class tracking the file counts and total bytes + * in different size ranges. */ case class State( - sizeInBytes: Long, - numOfSetTransactions: Long, - numOfFiles: Long, - numOfRemoves: Long, - numOfMetadata: Long, - numOfProtocol: Long, - setTransactions: Seq[SetTransaction], - metadata: Metadata, - protocol: Protocol, - fileSizeHistogram: Option[FileSizeHistogram] = None) + sizeInBytes: Long, + numOfSetTransactions: Long, + numOfFiles: Long, + numOfRemoves: Long, + numOfMetadata: Long, + numOfProtocol: Long, + setTransactions: Seq[SetTransaction], + metadata: Metadata, + protocol: Protocol, + fileSizeHistogram: Option[FileSizeHistogram] = None + ) } /** * An initial snapshot with only metadata specified. Useful for creating a DataFrame from an * existing parquet table during its conversion to delta. 
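assertLogFilesBelongToTable above enforces a simple invariant: every delta or checkpoint file handed to the snapshot must sit directly under the table's _delta_log directory. A minimal sketch of that check, using java.nio.file as a stand-in for the Hadoop Path handling in the real code:

import java.nio.file.{Path, Paths}

object LogFileOwnershipSketch {
  def assertBelongsTo(logBasePath: Path, files: Seq[Path]): Unit =
    files.foreach { f =>
      require(f.getParent == logBasePath,
        s"File ($f) doesn't belong in the transaction log at $logBasePath")
    }

  def main(args: Array[String]): Unit = {
    val log = Paths.get("/table/_delta_log")
    assertBelongsTo(log, Seq(Paths.get("/table/_delta_log/00000000000000000001.json")))
  }
}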
* - * @param logPath - * the path to transaction log - * @param deltaLog - * the delta log object - * @param metadata - * the metadata of the table + * @param logPath the path to transaction log + * @param deltaLog the delta log object + * @param metadata the metadata of the table */ class InitialSnapshot( val logPath: Path, @@ -517,27 +599,30 @@ class InitialSnapshot( path = logPath, version = -1, logSegment = LogSegment.empty(logPath), - minFileRetentionTimestamp = -1, deltaLog = deltaLog, timestamp = -1, - checksumOpt = None, - minSetTransactionRetentionTimestamp = None + checksumOpt = None ) { def this(logPath: Path, deltaLog: DeltaLog) = this( logPath, deltaLog, Metadata( - configuration = - DeltaConfigs.mergeGlobalConfigs(SparkSession.active.sessionState.conf, Map.empty), - createdTime = Some(System.currentTimeMillis())) - ) + configuration = DeltaConfigs.mergeGlobalConfigs( + sqlConfs = SparkSession.active.sessionState.conf, + tableConf = Map.empty, + ignoreProtocolConfsOpt = Some( + DeltaConfigs.ignoreProtocolDefaultsIsSet( + sqlConfs = SparkSession.active.sessionState.conf, + tableConf = deltaLog.allOptions))), + createdTime = Some(System.currentTimeMillis()))) override def stateDS: Dataset[SingleAction] = emptyDF.as[SingleAction] override def stateDF: DataFrame = emptyDF override protected lazy val computedState: Snapshot.State = initialState + override def protocol: Protocol = computedState.protocol private def initialState: Snapshot.State = { - val protocol = Protocol.forNewTable(spark, metadata) + val protocol = Protocol.forNewTable(spark, Some(metadata)) Snapshot.State( sizeInBytes = 0L, numOfSetTransactions = 0L, @@ -550,5 +635,4 @@ class InitialSnapshot( protocol = protocol ) } - } diff --git a/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/catalog/ClickHouseTableV2.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/catalog/ClickHouseTableV2.scala new file mode 100644 index 0000000000000..90370f0b1d998 --- /dev/null +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/catalog/ClickHouseTableV2.scala @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.delta.catalog +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.connector.read.InputPartition +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} +import org.apache.spark.sql.delta.{ClickhouseSnapshot, DeltaErrors, DeltaLog, DeltaTimeTravelSpec, Snapshot} +import org.apache.spark.sql.delta.actions.Metadata +import org.apache.spark.sql.delta.catalog.ClickHouseTableV2.deltaLog2Table +import org.apache.spark.sql.delta.sources.DeltaDataSource +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, PartitionDirectory} +import org.apache.spark.sql.execution.datasources.utils.MergeTreePartsPartitionsUtil +import org.apache.spark.sql.execution.datasources.v2.clickhouse.source.DeltaMergeTreeFileFormat +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.util.collection.BitSet + +import org.apache.hadoop.fs.Path + +import java.{util => ju} + +import scala.collection.JavaConverters._ + +@SuppressWarnings(Array("io.github.zhztheplayer.scalawarts.InheritFromCaseClass")) +class ClickHouseTableV2( + override val spark: SparkSession, + override val path: Path, + override val catalogTable: Option[CatalogTable] = None, + override val tableIdentifier: Option[String] = None, + override val timeTravelOpt: Option[DeltaTimeTravelSpec] = None, + override val options: Map[String, String] = Map.empty, + override val cdcOptions: CaseInsensitiveStringMap = CaseInsensitiveStringMap.empty(), + val clickhouseExtensionOptions: Map[String, String] = Map.empty) + extends DeltaTableV2( + spark, + path, + catalogTable, + tableIdentifier, + timeTravelOpt, + options, + cdcOptions) + with ClickHouseTableV2Base { + + lazy val (rootPath, partitionFilters, timeTravelByPath) = { + if (catalogTable.isDefined) { + // Fast path for reducing path munging overhead + (new Path(catalogTable.get.location), Nil, None) + } else { + DeltaDataSource.parsePathIdentifier(spark, path.toString, options) + } + } + + private lazy val timeTravelSpec: Option[DeltaTimeTravelSpec] = { + if (timeTravelOpt.isDefined && timeTravelByPath.isDefined) { + throw DeltaErrors.multipleTimeTravelSyntaxUsed + } + timeTravelOpt.orElse(timeTravelByPath) + } + + override def name(): String = + catalogTable + .map(_.identifier.unquotedString) + .orElse(tableIdentifier) + .getOrElse(s"clickhouse.`${deltaLog.dataPath}`") + + override def properties(): ju.Map[String, String] = { + val ret = super.properties() + + // for file path based write + if (snapshot.version < 0 && clickhouseExtensionOptions.nonEmpty) { + ret.putAll(clickhouseExtensionOptions.asJava) + } + ret + } + + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { + new WriteIntoDeltaBuilder(deltaLog, info.options) + } + + def getFileFormat(meta: Metadata): DeltaMergeTreeFileFormat = { + new DeltaMergeTreeFileFormat( + meta, + dataBaseName, + tableName, + ClickhouseSnapshot.genSnapshotId(snapshot), + orderByKeyOption, + lowCardKeyOption, + minmaxIndexKeyOption, + bfIndexKeyOption, + setIndexKeyOption, + primaryKeyOption, + clickhouseTableConfigs, + partitionColumns + ) + } + + override def deltaProperties(): ju.Map[String, String] = properties() + + override def deltaCatalog(): Option[CatalogTable] = catalogTable + + override def deltaPath(): Path = path + + override def deltaSnapshot(): Snapshot 
= snapshot + + def cacheThis(): Unit = { + deltaLog2Table.put(deltaLog, this) + } + + cacheThis() +} + +@SuppressWarnings(Array("io.github.zhztheplayer.scalawarts.InheritFromCaseClass")) +class TempClickHouseTableV2( + override val spark: SparkSession, + override val catalogTable: Option[CatalogTable] = None) + extends ClickHouseTableV2(spark, null, catalogTable) { + import collection.JavaConverters._ + override def properties(): ju.Map[String, String] = catalogTable.get.properties.asJava + override lazy val partitionColumns: Seq[String] = catalogTable.get.partitionColumnNames + override def cacheThis(): Unit = {} +} + +object ClickHouseTableV2 extends Logging { + private val deltaLog2Table = + new scala.collection.concurrent.TrieMap[DeltaLog, ClickHouseTableV2]() + // for CTAS use + val temporalThreadLocalCHTable = new ThreadLocal[ClickHouseTableV2]() + + def getTable(deltaLog: DeltaLog): ClickHouseTableV2 = { + if (deltaLog2Table.contains(deltaLog)) { + deltaLog2Table(deltaLog) + } else if (temporalThreadLocalCHTable.get() != null) { + temporalThreadLocalCHTable.get() + } else { + throw new IllegalStateException( + s"Can not find ClickHouseTableV2 for deltalog ${deltaLog.dataPath}") + } + } + + def clearCache(): Unit = { + deltaLog2Table.clear() + temporalThreadLocalCHTable.remove() + } + + def partsPartitions( + deltaLog: DeltaLog, + relation: HadoopFsRelation, + selectedPartitions: Array[PartitionDirectory], + output: Seq[Attribute], + bucketedScan: Boolean, + optionalBucketSet: Option[BitSet], + optionalNumCoalescedBuckets: Option[Int], + disableBucketedScan: Boolean, + filterExprs: Seq[Expression]): Seq[InputPartition] = { + val tableV2 = ClickHouseTableV2.getTable(deltaLog) + + MergeTreePartsPartitionsUtil.getMergeTreePartsPartitions( + relation, + selectedPartitions, + output, + bucketedScan, + tableV2.spark, + tableV2, + optionalBucketSet, + optionalNumCoalescedBuckets, + disableBucketedScan, + filterExprs) + + } +} diff --git a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/DeleteCommand.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/DeleteCommand.scala similarity index 61% rename from backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/DeleteCommand.scala rename to backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/DeleteCommand.scala index 006a3fce84296..5f9c2953ba16f 100644 --- a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/DeleteCommand.scala +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/DeleteCommand.scala @@ -14,33 +14,34 @@ * See the License for the specific language governing permissions and * limitations under the License. 
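The ClickHouseTableV2 companion object above resolves a table from its DeltaLog through a concurrent map, falling back to a thread-local entry that CTAS populates before the table is registered. A simplified, self-contained model of that lookup (all names are illustrative stand-ins):

import scala.collection.concurrent.TrieMap

object TableCacheSketch {
  final case class Log(dataPath: String)
  final case class Table(log: Log)

  private val byLog = TrieMap.empty[Log, Table]
  private val ctasTable = new ThreadLocal[Table]()

  def cache(t: Table): Unit = byLog.put(t.log, t)

  def getTable(log: Log): Table =
    byLog.get(log)
      .orElse(Option(ctasTable.get()))           // CTAS path: table not yet registered
      .getOrElse(throw new IllegalStateException(s"No table cached for ${log.dataPath}"))

  def main(args: Array[String]): Unit = {
    val log = Log("/data/t1")
    cache(Table(log))
    println(getTable(log))
  }
}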
*/ + package org.apache.spark.sql.delta.commands -import org.apache.spark.SparkContext -import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, EqualNullSafe, Expression, If, Literal, Not} -import org.apache.spark.sql.catalyst.plans.QueryPlan -import org.apache.spark.sql.catalyst.plans.logical.{DeltaDelete, LogicalPlan} import org.apache.spark.sql.delta._ import org.apache.spark.sql.delta.actions.{Action, AddCDCFile, AddFile, FileAction} import org.apache.spark.sql.delta.commands.DeleteCommand.{rewritingFilesMsg, FINDING_TOUCHED_FILES_MSG} import org.apache.spark.sql.delta.commands.MergeIntoCommand.totalBytesAndDistinctPartitionValues import org.apache.spark.sql.delta.files.TahoeBatchFileIndex import org.apache.spark.sql.delta.sources.DeltaSQLConf -import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.delta.util.Utils +import com.fasterxml.jackson.databind.annotation.JsonDeserialize + +import org.apache.spark.SparkContext +import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession} +import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, EqualNullSafe, Expression, If, Literal, Not} +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.logical.{DeltaDelete, LogicalPlan} import org.apache.spark.sql.execution.command.LeafRunnableCommand -import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} +import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.metric.SQLMetrics.{createMetric, createTimingMetric} import org.apache.spark.sql.functions.{col, explode, input_file_name, split} import org.apache.spark.sql.types.LongType -import com.fasterxml.jackson.databind.annotation.JsonDeserialize - /** * Gluten overwrite Delta: * - * This file is copied from Delta 2.2.0. It is modified to overcome the following issues: + * This file is copied from Delta 2.3.0. It is modified to overcome the following issues: * 1. In Clickhouse backend, we can't implement input_file_name() correctly, we can only implement * it so that it return a a list of filenames (concated by ','). 
*/ @@ -60,8 +61,8 @@ trait DeleteCommandMetrics { self: LeafRunnableCommand => "numPartitionsAddedTo" -> createMetric(sc, "number of partitions added"), "numPartitionsRemovedFrom" -> createMetric(sc, "number of partitions removed"), "numCopiedRows" -> createMetric(sc, "number of rows copied"), - "numBytesAdded" -> createMetric(sc, "number of bytes added"), - "numBytesRemoved" -> createMetric(sc, "number of bytes removed"), + "numAddedBytes" -> createMetric(sc, "number of bytes added"), + "numRemovedBytes" -> createMetric(sc, "number of bytes removed"), "executionTimeMs" -> createTimingMetric(sc, "time taken to execute the entire operation"), "scanTimeMs" -> @@ -73,7 +74,7 @@ trait DeleteCommandMetrics { self: LeafRunnableCommand => "numTouchedRows" -> createMetric(sc, "number of rows touched") ) - def getDeletedRowsFromAddFilesAndUpdateMetrics(files: Seq[AddFile]): Option[Long] = { + def getDeletedRowsFromAddFilesAndUpdateMetrics(files: Seq[AddFile]) : Option[Long] = { if (!conf.getConf(DeltaSQLConf.DELTA_DML_METRICS_FROM_METADATA)) { return None; } @@ -97,15 +98,18 @@ trait DeleteCommandMetrics { self: LeafRunnableCommand => /** * Performs a Delete based on the search condition * - * Algorithm: 1) Scan all the files and determine which files have the rows that need to be deleted. - * 2) Traverse the affected files and rebuild the touched files. 3) Use the Delta protocol to - * atomically write the remaining rows to new files and remove the affected files that are - * identified in step 1. + * Algorithm: + * 1) Scan all the files and determine which files have + * the rows that need to be deleted. + * 2) Traverse the affected files and rebuild the touched files. + * 3) Use the Delta protocol to atomically write the remaining rows to new files and remove + * the affected files that are identified in step 1. */ -case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Option[Expression]) - extends LeafRunnableCommand - with DeltaCommand - with DeleteCommandMetrics { +case class DeleteCommand( + deltaLog: DeltaLog, + target: LogicalPlan, + condition: Option[Expression]) + extends LeafRunnableCommand with DeltaCommand with DeleteCommandMetrics { override def innerChildren: Seq[QueryPlan[_]] = Seq(target) @@ -115,13 +119,15 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt final override def run(sparkSession: SparkSession): Seq[Row] = { recordDeltaOperation(deltaLog, "delta.dml.delete") { - deltaLog.assertRemovable() - deltaLog.withNewTransaction { - txn => - val deleteActions = performDelete(sparkSession, deltaLog, txn) - if (deleteActions.nonEmpty) { - txn.commit(deleteActions, DeltaOperations.Delete(condition.map(_.sql).toSeq)) - } + deltaLog.withNewTransaction { txn => + DeltaLog.assertRemovable(txn.snapshot) + if (hasBeenExecuted(txn, sparkSession)) { + sendDriverMetrics(sparkSession, metrics) + return Seq.empty + } + + val deleteActions = performDelete(sparkSession, deltaLog, txn) + txn.commitIfNeeded(deleteActions, DeltaOperations.Delete(condition.map(_.sql).toSeq)) } // Re-cache all cached plans(including this relation itself, if it's cached) that refer to // this data source relation. 
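The scaladoc above describes the scan/rewrite/commit algorithm, and performDelete (below) branches into three cases: no condition, a metadata-only predicate, or a data predicate that forces a rewrite (or, with DVs, a deletion-vector update). The following plain-Scala sketch only models that case split over hypothetical file objects; it is not the real planner.

object DeleteCasesSketch {
  final case class AddFile(path: String, partition: Map[String, String])

  sealed trait Plan
  case object RemoveAllFiles extends Plan                    // Case 1: DELETE with no condition
  final case class RemoveFiles(files: Seq[AddFile]) extends Plan   // Case 2: partition-only predicate
  final case class RewriteFiles(files: Seq[AddFile]) extends Plan  // Case 3: rewrite touched files

  def plan(
      files: Seq[AddFile],
      condition: Option[Map[String, String] => Boolean],
      partitionOnly: Boolean): Plan = condition match {
    case None => RemoveAllFiles
    case Some(pred) if partitionOnly => RemoveFiles(files.filter(f => pred(f.partition)))
    // Candidates pruned by statistics may or may not hold rows to delete; they get rewritten.
    case Some(pred) => RewriteFiles(files.filter(f => pred(f.partition)))
  }

  def main(args: Array[String]): Unit = {
    val files = Seq(AddFile("a", Map("p" -> "1")), AddFile("b", Map("p" -> "2")))
    println(plan(files, Some(pv => pv("p") == "1"), partitionOnly = true))
  }
}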
@@ -150,9 +156,9 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt var numAddedChangeFiles: Long = 0 var scanTimeMs: Long = 0 var rewriteTimeMs: Long = 0 - var numBytesAdded: Long = 0 + var numAddedBytes: Long = 0 var changeFileBytes: Long = 0 - var numBytesRemoved: Long = 0 + var numRemovedBytes: Long = 0 var numFilesBeforeSkipping: Long = 0 var numBytesBeforeSkipping: Long = 0 var numFilesAfterSkipping: Long = 0 @@ -175,7 +181,7 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt numRemovedFiles = allFiles.size scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 val (numBytes, numPartitions) = totalBytesAndDistinctPartitionValues(allFiles) - numBytesRemoved = numBytes + numRemovedBytes = numBytes numFilesBeforeSkipping = numRemovedFiles numBytesBeforeSkipping = numBytes numFilesAfterSkipping = numRemovedFiles @@ -192,9 +198,7 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt case Some(cond) => val (metadataPredicates, otherPredicates) = DeltaTableUtils.splitMetadataAndDataPredicates( - cond, - txn.metadata.partitionColumns, - sparkSession) + cond, txn.metadata.partitionColumns, sparkSession) numFilesBeforeSkipping = txn.snapshot.numOfFiles numBytesBeforeSkipping = txn.snapshot.sizeInBytes @@ -209,7 +213,7 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 numRemovedFiles = candidateFiles.size - numBytesRemoved = candidateFiles.map(_.size).sum + numRemovedBytes = candidateFiles.map(_.size).sum numFilesAfterSkipping = candidateFiles.size val (numCandidateBytes, numCandidatePartitions) = totalBytesAndDistinctPartitionValues(candidateFiles) @@ -224,7 +228,15 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt candidateFiles.map(_.removeWithTimestamp(operationTimestamp)) } else { // Case 3: Delete the rows based on the condition. - val candidateFiles = txn.filterFiles(metadataPredicates ++ otherPredicates) + + // Should we write the DVs to represent the deleted rows? + val shouldWriteDVs = shouldWritePersistentDeletionVectors(sparkSession, txn) + + val candidateFiles = txn.filterFiles( + metadataPredicates ++ otherPredicates, + keepNumRecords = shouldWriteDVs) + // `candidateFiles` contains the files filtered using statistics and delete condition + // They may or may not contains any rows that need to be deleted. numFilesAfterSkipping = candidateFiles.size val (numCandidateBytes, numCandidatePartitions) = @@ -237,89 +249,104 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt val nameToAddFileMap = generateCandidateFileMap(deltaLog.dataPath, candidateFiles) val fileIndex = new TahoeBatchFileIndex( - sparkSession, - "delete", - candidateFiles, - deltaLog, - deltaLog.dataPath, - txn.snapshot) - // Keep everything from the resolved target except a new TahoeFileIndex - // that only involves the affected files instead of all files. 
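The renamed byte metrics above are fed by totalBytesAndDistinctPartitionValues, imported from MergeIntoCommand. Its exact implementation is not part of this patch; inferred from how it is used here, it returns the summed file size and the count of distinct partition values, roughly as in this stand-in:

object FileStatsSketch {
  final case class AddFile(path: String, size: Long, partitionValues: Map[String, String])

  def totalBytesAndDistinctPartitionValues(files: Seq[AddFile]): (Long, Int) =
    (files.map(_.size).sum, files.map(_.partitionValues).distinct.size)

  def main(args: Array[String]): Unit = {
    val files = Seq(
      AddFile("a", 10, Map("p" -> "1")),
      AddFile("b", 20, Map("p" -> "1")),
      AddFile("c", 5, Map("p" -> "2")))
    println(totalBytesAndDistinctPartitionValues(files)) // (35,2)
  }
}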
- val newTarget = DeltaTableUtils.replaceFileIndex(target, fileIndex) - val data = Dataset.ofRows(sparkSession, newTarget) - val deletedRowCount = metrics("numDeletedRows") - val deletedRowUdf = DeltaUDF - .boolean { - () => - deletedRowCount += 1 - true - } - .asNondeterministic() - val filesToRewrite = - withStatusCode("DELTA", FINDING_TOUCHED_FILES_MSG) { - if (candidateFiles.isEmpty) { - Array.empty[String] - } else { - data - .filter(new Column(cond)) - .select(input_file_name().as("input_files")) - .filter(deletedRowUdf()) - .select(explode(split(col("input_files"), ","))) - .distinct() - .as[String] - .collect() - } - } + sparkSession, "delete", candidateFiles, deltaLog, deltaLog.dataPath, txn.snapshot) + if (shouldWriteDVs) { + val targetDf = DeleteWithDeletionVectorsHelper.createTargetDfForScanningForMatches( + sparkSession, + target, + fileIndex) - numRemovedFiles = filesToRewrite.length - scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 - if (filesToRewrite.isEmpty) { - // Case 3.1: no row matches and no delete will be triggered - if (txn.metadata.partitionColumns.nonEmpty) { - numPartitionsRemovedFrom = Some(0) - numPartitionsAddedTo = Some(0) - } - Nil - } else { - // Case 3.2: some files need an update to remove the deleted files - // Do the second pass and just read the affected files - val baseRelation = buildBaseRelation( + // Does the target table already has DVs enabled? If so, we need to read the table + // with deletion vectors. + val mustReadDeletionVectors = DeletionVectorUtils.deletionVectorsReadable(txn.snapshot) + + val touchedFiles = DeleteWithDeletionVectorsHelper.findTouchedFiles( sparkSession, txn, - "delete", - deltaLog.dataPath, - filesToRewrite, - nameToAddFileMap) + mustReadDeletionVectors, + deltaLog, + targetDf, + fileIndex, + cond) + + if (touchedFiles.nonEmpty) { + DeleteWithDeletionVectorsHelper.processUnmodifiedData(touchedFiles) + } else { + Nil // Nothing to update + } + } else { // Keep everything from the resolved target except a new TahoeFileIndex // that only involves the affected files instead of all files. 
- val newTarget = DeltaTableUtils.replaceFileIndex(target, baseRelation.location) - val targetDF = Dataset.ofRows(sparkSession, newTarget) - val filterCond = Not(EqualNullSafe(cond, Literal.TrueLiteral)) - val rewrittenActions = rewriteFiles(txn, targetDF, filterCond, filesToRewrite.length) - val (changeFiles, rewrittenFiles) = rewrittenActions - .partition(_.isInstanceOf[AddCDCFile]) - numAddedFiles = rewrittenFiles.size - val removedFiles = - filesToRewrite.map(f => getTouchedFile(deltaLog.dataPath, f, nameToAddFileMap)) - val (removedBytes, removedPartitions) = - totalBytesAndDistinctPartitionValues(removedFiles) - numBytesRemoved = removedBytes - val (rewrittenBytes, rewrittenPartitions) = - totalBytesAndDistinctPartitionValues(rewrittenFiles) - numBytesAdded = rewrittenBytes - if (txn.metadata.partitionColumns.nonEmpty) { - numPartitionsRemovedFrom = Some(removedPartitions) - numPartitionsAddedTo = Some(rewrittenPartitions) + val newTarget = DeltaTableUtils.replaceFileIndex(target, fileIndex) + val data = Dataset.ofRows(sparkSession, newTarget) + val deletedRowCount = metrics("numDeletedRows") + val deletedRowUdf = DeltaUDF.boolean { () => + deletedRowCount += 1 + true + }.asNondeterministic() + val filesToRewrite = + withStatusCode("DELTA", FINDING_TOUCHED_FILES_MSG) { + if (candidateFiles.isEmpty) { + Array.empty[String] + } else { + // --- modified start + data.filter(new Column(cond)) + .select(input_file_name().as("input_files")) + .filter(deletedRowUdf()) + .select(explode(split(col("input_files"), ","))) + .distinct() + .as[String] + .collect() + // --- modified end + } + } + + numRemovedFiles = filesToRewrite.length + scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 + if (filesToRewrite.isEmpty) { + // Case 3.1: no row matches and no delete will be triggered + if (txn.metadata.partitionColumns.nonEmpty) { + numPartitionsRemovedFrom = Some(0) + numPartitionsAddedTo = Some(0) + } + Nil + } else { + // Case 3.2: some files need an update to remove the deleted files + // Do the second pass and just read the affected files + val baseRelation = buildBaseRelation( + sparkSession, txn, "delete", deltaLog.dataPath, filesToRewrite, nameToAddFileMap) + // Keep everything from the resolved target except a new TahoeFileIndex + // that only involves the affected files instead of all files. 
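The "modified" block above exists because, with the ClickHouse backend, input_file_name() yields a comma-concatenated list of file names per row rather than a single path, so the result is split and exploded before collecting distinct files. A minimal model of that normalization step:

object InputFileNamesSketch {
  def touchedFiles(inputFileNameValues: Seq[String]): Seq[String] =
    inputFileNameValues
      .flatMap(_.split(','))   // mirrors explode(split(col("input_files"), ","))
      .filter(_.nonEmpty)
      .distinct

  def main(args: Array[String]): Unit = {
    val rows = Seq("part-0.parquet,part-1.parquet", "part-1.parquet")
    println(touchedFiles(rows)) // List(part-0.parquet, part-1.parquet)
  }
}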
+ val newTarget = DeltaTableUtils.replaceFileIndex(target, baseRelation.location) + val targetDF = Dataset.ofRows(sparkSession, newTarget) + val filterCond = Not(EqualNullSafe(cond, Literal.TrueLiteral)) + val rewrittenActions = rewriteFiles(txn, targetDF, filterCond, filesToRewrite.length) + val (changeFiles, rewrittenFiles) = rewrittenActions + .partition(_.isInstanceOf[AddCDCFile]) + numAddedFiles = rewrittenFiles.size + val removedFiles = filesToRewrite.map(f => + getTouchedFile(deltaLog.dataPath, f, nameToAddFileMap)) + val (removedBytes, removedPartitions) = + totalBytesAndDistinctPartitionValues(removedFiles) + numRemovedBytes = removedBytes + val (rewrittenBytes, rewrittenPartitions) = + totalBytesAndDistinctPartitionValues(rewrittenFiles) + numAddedBytes = rewrittenBytes + if (txn.metadata.partitionColumns.nonEmpty) { + numPartitionsRemovedFrom = Some(removedPartitions) + numPartitionsAddedTo = Some(rewrittenPartitions) + } + numAddedChangeFiles = changeFiles.size + changeFileBytes = changeFiles.collect { case f: AddCDCFile => f.size }.sum + rewriteTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 - scanTimeMs + numDeletedRows = Some(metrics("numDeletedRows").value) + numCopiedRows = + Some(metrics("numTouchedRows").value - metrics("numDeletedRows").value) + + val operationTimestamp = System.currentTimeMillis() + removeFilesFromPaths( + deltaLog, nameToAddFileMap, filesToRewrite, operationTimestamp) ++ rewrittenActions } - numAddedChangeFiles = changeFiles.size - changeFileBytes = changeFiles.collect { case f: AddCDCFile => f.size }.sum - rewriteTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 - scanTimeMs - numDeletedRows = Some(metrics("numDeletedRows").value) - numCopiedRows = Some(metrics("numTouchedRows").value - metrics("numDeletedRows").value) - - val operationTimestamp = System.currentTimeMillis() - removeFilesFromPaths(deltaLog, nameToAddFileMap, filesToRewrite, operationTimestamp) ++ - rewrittenActions } } } @@ -331,8 +358,8 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt metrics("rewriteTimeMs").set(rewriteTimeMs) metrics("numAddedChangeFiles").set(numAddedChangeFiles) metrics("changeFileBytes").set(changeFileBytes) - metrics("numBytesAdded").set(numBytesAdded) - metrics("numBytesRemoved").set(numBytesRemoved) + metrics("numAddedBytes").set(numAddedBytes) + metrics("numRemovedBytes").set(numRemovedBytes) metrics("numFilesBeforeSkipping").set(numFilesBeforeSkipping) metrics("numBytesBeforeSkipping").set(numBytesBeforeSkipping) metrics("numFilesAfterSkipping").set(numFilesAfterSkipping) @@ -342,9 +369,7 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt numPartitionsRemovedFrom.foreach(metrics("numPartitionsRemovedFrom").set) numCopiedRows.foreach(metrics("numCopiedRows").set) txn.registerSQLMetrics(sparkSession, metrics) - // This is needed to make the SQL metrics visible in the Spark UI - val executionId = sparkSession.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) - SQLMetrics.postDriverMetricUpdates(sparkSession.sparkContext, executionId, metrics.values.toSeq) + sendDriverMetrics(sparkSession, metrics) recordDeltaEvent( deltaLog, @@ -366,18 +391,23 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt numPartitionsRemovedFrom, numCopiedRows, numDeletedRows, - numBytesAdded, - numBytesRemoved, + numAddedBytes, + numRemovedBytes, changeFileBytes = changeFileBytes, scanTimeMs, - rewriteTimeMs - ) + rewriteTimeMs) ) - deleteActions + if 
(deleteActions.nonEmpty) { + createSetTransaction(sparkSession, deltaLog).toSeq ++ deleteActions + } else { + Seq.empty + } } - /** Returns the list of [[AddFile]]s and [[AddCDCFile]]s that have been re-written. */ + /** + * Returns the list of [[AddFile]]s and [[AddCDCFile]]s that have been re-written. + */ private def rewriteFiles( txn: OptimisticTransaction, baseData: DataFrame, @@ -387,15 +417,13 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt // number of total rows that we have seen / are either copying or deleting (sum of both). val numTouchedRows = metrics("numTouchedRows") - val numTouchedRowsUdf = DeltaUDF - .boolean { - () => - numTouchedRows += 1 - true - } - .asNondeterministic() + val numTouchedRowsUdf = DeltaUDF.boolean { () => + numTouchedRows += 1 + true + }.asNondeterministic() - withStatusCode("DELTA", rewritingFilesMsg(numFilesToRewrite)) { + withStatusCode( + "DELTA", rewritingFilesMsg(numFilesToRewrite)) { val dfToWrite = if (shouldWriteCdc) { import org.apache.spark.sql.delta.commands.cdc.CDCReader._ // The logic here ends up being surprisingly elegant, with all source rows ending up in @@ -418,6 +446,14 @@ case class DeleteCommand(deltaLog: DeltaLog, target: LogicalPlan, condition: Opt txn.writeFiles(dfToWrite) } } + + def shouldWritePersistentDeletionVectors( + spark: SparkSession, txn: OptimisticTransaction): Boolean = { + // DELETE with DVs only enabled for tests. + Utils.isTesting && + spark.conf.get(DeltaSQLConf.DELETE_USE_PERSISTENT_DELETION_VECTORS) && + DeletionVectorUtils.deletionVectorsWritable(txn.snapshot) + } } object DeleteCommand { @@ -441,51 +477,29 @@ object DeleteCommand { /** * Used to report details about delete. * - * @param condition: - * what was the delete condition - * @param numFilesTotal: - * how big is the table - * @param numTouchedFiles: - * how many files did we touch. Alias for `numFilesAfterSkipping` - * @param numRewrittenFiles: - * how many files had to be rewritten. Alias for `numAddedFiles` - * @param numRemovedFiles: - * how many files we removed. Alias for `numTouchedFiles` - * @param numAddedFiles: - * how many files we added. Alias for `numRewrittenFiles` - * @param numAddedChangeFiles: - * how many change files were generated - * @param numFilesBeforeSkipping: - * how many candidate files before skipping - * @param numBytesBeforeSkipping: - * how many candidate bytes before skipping - * @param numFilesAfterSkipping: - * how many candidate files after skipping - * @param numBytesAfterSkipping: - * how many candidate bytes after skipping - * @param numPartitionsAfterSkipping: - * how many candidate partitions after skipping - * @param numPartitionsAddedTo: - * how many new partitions were added - * @param numPartitionsRemovedFrom: - * how many partitions were removed - * @param numCopiedRows: - * how many rows were copied - * @param numDeletedRows: - * how many rows were deleted - * @param numBytesAdded: - * how many bytes were added - * @param numBytesRemoved: - * how many bytes were removed - * @param changeFileBytes: - * total size of change files generated - * @param scanTimeMs: - * how long did finding take - * @param rewriteTimeMs: - * how long did rewriting take + * @param condition: what was the delete condition + * @param numFilesTotal: how big is the table + * @param numTouchedFiles: how many files did we touch. Alias for `numFilesAfterSkipping` + * @param numRewrittenFiles: how many files had to be rewritten. 
Alias for `numAddedFiles` + * @param numRemovedFiles: how many files we removed. Alias for `numTouchedFiles` + * @param numAddedFiles: how many files we added. Alias for `numRewrittenFiles` + * @param numAddedChangeFiles: how many change files were generated + * @param numFilesBeforeSkipping: how many candidate files before skipping + * @param numBytesBeforeSkipping: how many candidate bytes before skipping + * @param numFilesAfterSkipping: how many candidate files after skipping + * @param numBytesAfterSkipping: how many candidate bytes after skipping + * @param numPartitionsAfterSkipping: how many candidate partitions after skipping + * @param numPartitionsAddedTo: how many new partitions were added + * @param numPartitionsRemovedFrom: how many partitions were removed + * @param numCopiedRows: how many rows were copied + * @param numDeletedRows: how many rows were deleted + * @param numBytesAdded: how many bytes were added + * @param numBytesRemoved: how many bytes were removed + * @param changeFileBytes: total size of change files generated + * @param scanTimeMs: how long did finding take + * @param rewriteTimeMs: how long did rewriting take * - * @note - * All the time units are milliseconds. + * @note All the time units are milliseconds. */ case class DeleteMetric( condition: String, diff --git a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/MergeIntoCommand.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/MergeIntoCommand.scala similarity index 70% rename from backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/MergeIntoCommand.scala rename to backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/MergeIntoCommand.scala index 5967d66b13b52..bb4d668975652 100644 --- a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/MergeIntoCommand.scala +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/MergeIntoCommand.scala @@ -14,63 +14,66 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.spark.sql.delta.commands +import java.util.concurrent.TimeUnit + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import org.apache.spark.sql.delta._ +import org.apache.spark.sql.delta.actions.{AddCDCFile, AddFile, FileAction} +import org.apache.spark.sql.delta.commands.merge.MergeIntoMaterializeSource +import org.apache.spark.sql.delta.files._ +import org.apache.spark.sql.delta.schema.{ImplicitMetadataOperation, SchemaUtils} +import org.apache.spark.sql.delta.sources.DeltaSQLConf +import org.apache.spark.sql.delta.util.{AnalysisHelper, SetAccumulator} +import com.fasterxml.jackson.databind.annotation.JsonDeserialize + import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} -import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, BasePredicate, Expression, Literal, NamedExpression, PredicateHelper, UnsafeProjection} +import org.apache.spark.sql.catalyst.expressions.{Alias, And, Attribute, AttributeReference, BasePredicate, Expression, Literal, NamedExpression, PredicateHelper, UnsafeProjection} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap -import org.apache.spark.sql.delta._ -import org.apache.spark.sql.delta.actions.{AddCDCFile, AddFile, FileAction} -import org.apache.spark.sql.delta.commands.merge.MergeIntoMaterializeSource -import org.apache.spark.sql.delta.files._ -import org.apache.spark.sql.delta.schema.{ImplicitMetadataOperation, SchemaUtils} -import org.apache.spark.sql.delta.sources.DeltaSQLConf -import org.apache.spark.sql.delta.util.{AnalysisHelper, SetAccumulator} -import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.command.LeafRunnableCommand import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DataTypes, LongType, StructType} -import com.fasterxml.jackson.databind.annotation.JsonDeserialize - -import java.util.concurrent.TimeUnit - -import scala.collection.JavaConverters._ -import scala.collection.mutable - /** * Gluten overwrite Delta: * - * This file is copied from Delta 2.2.0. It is modified to overcome the following issues: + * This file is copied from Delta 2.3.0. It is modified to overcome the following issues: * 1. In Clickhouse backend, we can't implement input_file_name() correctly, we can only implement * it so that it return a a list of filenames (concated by ','). 
*/ case class MergeDataSizes( - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - rows: Option[Long] = None, - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - files: Option[Long] = None, - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - bytes: Option[Long] = None, - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - partitions: Option[Long] = None) + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + rows: Option[Long] = None, + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + files: Option[Long] = None, + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + bytes: Option[Long] = None, + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + partitions: Option[Long] = None) /** * Represents the state of a single merge clause: - * - merge clause's (optional) predicate - * - action type (insert, update, delete) - * - action's expressions + * - merge clause's (optional) predicate + * - action type (insert, update, delete) + * - action's expressions */ -case class MergeClauseStats(condition: Option[String], actionType: String, actionExpr: Seq[String]) +case class MergeClauseStats( + condition: Option[String], + actionType: String, + actionExpr: Seq[String]) object MergeClauseStats { def apply(mergeClause: DeltaMergeIntoClause): MergeClauseStats = { @@ -93,9 +96,10 @@ case class MergeStats( insertExprs: Seq[String], deleteConditionExpr: String, - // Newer expressions used in MERGE with any number of MATCHED/NOT MATCHED + // Newer expressions used in MERGE with any number of MATCHED/NOT MATCHED/NOT MATCHED BY SOURCE matchedStats: Seq[MergeClauseStats], notMatchedStats: Seq[MergeClauseStats], + notMatchedBySourceStats: Seq[MergeClauseStats], // Timings executionTimeMs: Long, @@ -126,8 +130,12 @@ case class MergeStats( targetPartitionsAddedTo: Option[Long], targetRowsCopied: Long, targetRowsUpdated: Long, + targetRowsMatchedUpdated: Long, + targetRowsNotMatchedBySourceUpdated: Long, targetRowsInserted: Long, targetRowsDeleted: Long, + targetRowsMatchedDeleted: Long, + targetRowsNotMatchedBySourceDeleted: Long, // MergeMaterializeSource stats materializeSourceReason: Option[String] = None, @@ -142,6 +150,7 @@ object MergeStats { condition: Expression, matchedClauses: Seq[DeltaMergeIntoMatchedClause], notMatchedClauses: Seq[DeltaMergeIntoNotMatchedClause], + notMatchedBySourceClauses: Seq[DeltaMergeIntoNotMatchedBySourceClause], isPartitioned: Boolean): MergeStats = { def metricValueIfPartitioned(metricName: String): Option[Long] = { @@ -152,9 +161,11 @@ object MergeStats { // Merge condition expression conditionExpr = condition.sql, - // Newer expressions used in MERGE with any number of MATCHED/NOT MATCHED + // Newer expressions used in MERGE with any number of MATCHED/NOT MATCHED/ + // NOT MATCHED BY SOURCE matchedStats = matchedClauses.map(MergeClauseStats(_)), notMatchedStats = notMatchedClauses.map(MergeClauseStats(_)), + notMatchedBySourceStats = notMatchedBySourceClauses.map(MergeClauseStats(_)), // Timings executionTimeMs = metrics("executionTimeMs").value, @@ -163,15 +174,17 @@ object MergeStats { // Data sizes of source and target at different stages of processing source = MergeDataSizes(rows = Some(metrics("numSourceRows").value)), - targetBeforeSkipping = MergeDataSizes( - files = Some(metrics("numTargetFilesBeforeSkipping").value), - bytes = Some(metrics("numTargetBytesBeforeSkipping").value)), - targetAfterSkipping = MergeDataSizes( - files = Some(metrics("numTargetFilesAfterSkipping").value), - bytes = 
Some(metrics("numTargetBytesAfterSkipping").value), - partitions = metricValueIfPartitioned("numTargetPartitionsAfterSkipping") - ), - sourceRowsInSecondScan = metrics.get("numSourceRowsInSecondScan").map(_.value).filter(_ >= 0), + targetBeforeSkipping = + MergeDataSizes( + files = Some(metrics("numTargetFilesBeforeSkipping").value), + bytes = Some(metrics("numTargetBytesBeforeSkipping").value)), + targetAfterSkipping = + MergeDataSizes( + files = Some(metrics("numTargetFilesAfterSkipping").value), + bytes = Some(metrics("numTargetBytesAfterSkipping").value), + partitions = metricValueIfPartitioned("numTargetPartitionsAfterSkipping")), + sourceRowsInSecondScan = + metrics.get("numSourceRowsInSecondScan").map(_.value).filter(_ >= 0), // Data change sizes targetFilesAdded = metrics("numTargetFilesAdded").value, @@ -184,49 +197,48 @@ object MergeStats { targetPartitionsAddedTo = metricValueIfPartitioned("numTargetPartitionsAddedTo"), targetRowsCopied = metrics("numTargetRowsCopied").value, targetRowsUpdated = metrics("numTargetRowsUpdated").value, + targetRowsMatchedUpdated = metrics("numTargetRowsMatchedUpdated").value, + targetRowsNotMatchedBySourceUpdated = metrics("numTargetRowsNotMatchedBySourceUpdated").value, targetRowsInserted = metrics("numTargetRowsInserted").value, targetRowsDeleted = metrics("numTargetRowsDeleted").value, + targetRowsMatchedDeleted = metrics("numTargetRowsMatchedDeleted").value, + targetRowsNotMatchedBySourceDeleted = metrics("numTargetRowsNotMatchedBySourceDeleted").value, // Deprecated fields updateConditionExpr = null, updateExprs = null, insertConditionExpr = null, insertExprs = null, - deleteConditionExpr = null - ) + deleteConditionExpr = null) } } /** * Performs a merge of a source query/table into a Delta table. * - * Issues an error message when the ON search_condition of the MERGE statement can match a single - * row from the target table with multiple rows of the source table-reference. + * Issues an error message when the ON search_condition of the MERGE statement can match + * a single row from the target table with multiple rows of the source table-reference. * * Algorithm: * - * Phase 1: Find the input files in target that are touched by the rows that satisfy the condition - * and verify that no two source rows match with the same target row. This is implemented as an - * inner-join using the given condition. See [[findTouchedFiles]] for more details. + * Phase 1: Find the input files in target that are touched by the rows that satisfy + * the condition and verify that no two source rows match with the same target row. + * This is implemented as an inner-join using the given condition. See [[findTouchedFiles]] + * for more details. * * Phase 2: Read the touched files again and write new files with updated and/or inserted rows. * * Phase 3: Use the Delta protocol to atomically remove the touched files and add the new files. * - * @param source - * Source data to merge from - * @param target - * Target table to merge into - * @param targetFileIndex - * TahoeFileIndex of the target table - * @param condition - * Condition for a source row to match with a target row - * @param matchedClauses - * All info related to matched clauses. - * @param notMatchedClauses - * All info related to not matched clause. - * @param migratedSchema - * The final schema of the target - may be changed by schema evolution. 
+ * @param source Source data to merge from + * @param target Target table to merge into + * @param targetFileIndex TahoeFileIndex of the target table + * @param condition Condition for a source row to match with a target row + * @param matchedClauses All info related to matched clauses. + * @param notMatchedClauses All info related to not matched clauses. + * @param notMatchedBySourceClauses All info related to not matched by source clauses. + * @param migratedSchema The final schema of the target - may be changed by schema + * evolution. */ case class MergeIntoCommand( @transient source: LogicalPlan, @@ -235,18 +247,18 @@ case class MergeIntoCommand( condition: Expression, matchedClauses: Seq[DeltaMergeIntoMatchedClause], notMatchedClauses: Seq[DeltaMergeIntoNotMatchedClause], - migratedSchema: Option[StructType]) - extends LeafRunnableCommand + notMatchedBySourceClauses: Seq[DeltaMergeIntoNotMatchedBySourceClause], + migratedSchema: Option[StructType]) extends LeafRunnableCommand with DeltaCommand with PredicateHelper with AnalysisHelper with ImplicitMetadataOperation with MergeIntoMaterializeSource { - import org.apache.spark.sql.delta.commands.cdc.CDCReader._ - import MergeIntoCommand._ + import SQLMetrics._ + import org.apache.spark.sql.delta.commands.cdc.CDCReader._ override val canMergeSchema: Boolean = conf.getConf(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE) override val canOverwriteSchema: Boolean = false @@ -255,20 +267,18 @@ case class MergeIntoCommand( AttributeReference("num_affected_rows", LongType)(), AttributeReference("num_updated_rows", LongType)(), AttributeReference("num_deleted_rows", LongType)(), - AttributeReference("num_inserted_rows", LongType)() - ) + AttributeReference("num_inserted_rows", LongType)()) @transient private lazy val sc: SparkContext = SparkContext.getOrCreate() @transient private lazy val targetDeltaLog: DeltaLog = targetFileIndex.deltaLog - /** - * Map to get target output attributes by name. The case sensitivity of the map is set accordingly - * to Spark configuration. + * Map to get target output attributes by name. + * The case sensitivity of the map is set accordingly to Spark configuration. */ @transient private lazy val targetOutputAttributesMap: Map[String, Attribute] = { - val attrMap: Map[String, Attribute] = target.outputSet.view - .map(attr => attr.name -> attr) - .toMap + val attrMap: Map[String, Attribute] = target + .outputSet.view + .map(attr => attr.name -> attr).toMap if (conf.caseSensitiveAnalysis) { attrMap } else { @@ -277,10 +287,10 @@ case class MergeIntoCommand( } /** Whether this merge statement has only a single insert (NOT MATCHED) clause. */ - private def isSingleInsertOnly: Boolean = matchedClauses.isEmpty && notMatchedClauses.length == 1 - - /** Whether this merge statement has only MATCHED clauses. */ - private def isMatchedOnly: Boolean = notMatchedClauses.isEmpty && matchedClauses.nonEmpty + private def isSingleInsertOnly: Boolean = + matchedClauses.isEmpty && notMatchedBySourceClauses.isEmpty && notMatchedClauses.length == 1 + /** Whether this merge statement has no insert (NOT MATCHED) clause. */ + private def hasNoInserts: Boolean = notMatchedClauses.isEmpty // We over-count numTargetRowsDeleted when there are multiple matches; // this is the amount of the overcount, so we can subtract it to get a correct final metric. 
@@ -293,7 +303,15 @@ case class MergeIntoCommand( "numTargetRowsCopied" -> createMetric(sc, "number of target rows rewritten unmodified"), "numTargetRowsInserted" -> createMetric(sc, "number of inserted rows"), "numTargetRowsUpdated" -> createMetric(sc, "number of updated rows"), + "numTargetRowsMatchedUpdated" -> + createMetric(sc, "number of rows updated by a matched clause"), + "numTargetRowsNotMatchedBySourceUpdated" -> + createMetric(sc, "number of rows updated by a not matched by source clause"), "numTargetRowsDeleted" -> createMetric(sc, "number of deleted rows"), + "numTargetRowsMatchedDeleted" -> + createMetric(sc, "number of rows deleted by a matched clause"), + "numTargetRowsNotMatchedBySourceDeleted" -> + createMetric(sc, "number of rows deleted by a not matched by source clause"), "numTargetFilesBeforeSkipping" -> createMetric(sc, "number of target files before skipping"), "numTargetFilesAfterSkipping" -> createMetric(sc, "number of target files after skipping"), "numTargetFilesRemoved" -> createMetric(sc, "number of files removed to target"), @@ -317,8 +335,7 @@ case class MergeIntoCommand( "scanTimeMs" -> createTimingMetric(sc, "time taken to scan the files for matches"), "rewriteTimeMs" -> - createTimingMetric(sc, "time taken to rewrite the matched files") - ) + createTimingMetric(sc, "time taken to rewrite the matched files")) override def run(spark: SparkSession): Seq[Row] = { metrics("executionTimeMs").set(0) @@ -334,7 +351,7 @@ case class MergeIntoCommand( if (newNullColumn.isDefined) { throw new AnalysisException( s"""Cannot add column '${newNullColumn.get}' with type 'void'. Please explicitly specify a - |non-void type.""".stripMargin.replaceAll("\n", " ") + |non-void type.""".stripMargin.replaceAll("\n", " ") ) } } @@ -344,118 +361,107 @@ case class MergeIntoCommand( } else { // If it is determined that source should be materialized, wrap the execution with retries, // in case the data of the materialized source is lost. - runWithMaterializedSourceLostRetries(spark, targetFileIndex.deltaLog, metrics, runMerge) + runWithMaterializedSourceLostRetries( + spark, targetFileIndex.deltaLog, metrics, runMerge) } } protected def runMerge(spark: SparkSession): Seq[Row] = { recordDeltaOperation(targetDeltaLog, "delta.dml.merge") { val startTime = System.nanoTime() - targetDeltaLog.withNewTransaction { - deltaTxn => - if (target.schema.size != deltaTxn.metadata.schema.size) { - throw DeltaErrors.schemaChangedSinceAnalysis( - atAnalysis = target.schema, - latestSchema = deltaTxn.metadata.schema) - } + targetDeltaLog.withNewTransaction { deltaTxn => + if (hasBeenExecuted(deltaTxn, spark)) { + sendDriverMetrics(spark, metrics) + return Seq.empty + } + if (target.schema.size != deltaTxn.metadata.schema.size) { + throw DeltaErrors.schemaChangedSinceAnalysis( + atAnalysis = target.schema, latestSchema = deltaTxn.metadata.schema) + } - if (canMergeSchema) { - updateMetadata( - spark, - deltaTxn, - migratedSchema.getOrElse(target.schema), - deltaTxn.metadata.partitionColumns, - deltaTxn.metadata.configuration, - isOverwriteMode = false, - rearrangeOnly = false - ) - } + if (canMergeSchema) { + updateMetadata( + spark, deltaTxn, migratedSchema.getOrElse(target.schema), + deltaTxn.metadata.partitionColumns, deltaTxn.metadata.configuration, + isOverwriteMode = false, rearrangeOnly = false) + } - // If materialized, prepare the DF reading the materialize source - // Otherwise, prepare a regular DF from source plan. 
- val materializeSourceReason = prepareSourceDFAndReturnMaterializeReason( - spark, - source, - condition, - matchedClauses, - notMatchedClauses, - isSingleInsertOnly) - - val deltaActions = { - if (isSingleInsertOnly && spark.conf.get(DeltaSQLConf.MERGE_INSERT_ONLY_ENABLED)) { - writeInsertsOnlyWhenNoMatchedClauses(spark, deltaTxn) - } else { - val filesToRewrite = findTouchedFiles(spark, deltaTxn) - val newWrittenFiles = withStatusCode("DELTA", "Writing merged data") { - writeAllChanges(spark, deltaTxn, filesToRewrite) - } - filesToRewrite.map(_.remove) ++ newWrittenFiles + // If materialized, prepare the DF reading the materialize source + // Otherwise, prepare a regular DF from source plan. + val materializeSourceReason = prepareSourceDFAndReturnMaterializeReason( + spark, + source, + condition, + matchedClauses, + notMatchedClauses, + isSingleInsertOnly) + + val deltaActions = { + if (isSingleInsertOnly && spark.conf.get(DeltaSQLConf.MERGE_INSERT_ONLY_ENABLED)) { + writeInsertsOnlyWhenNoMatchedClauses(spark, deltaTxn) + } else { + val filesToRewrite = findTouchedFiles(spark, deltaTxn) + val newWrittenFiles = withStatusCode("DELTA", "Writing merged data") { + writeAllChanges(spark, deltaTxn, filesToRewrite) } + filesToRewrite.map(_.remove) ++ newWrittenFiles } + } - // Metrics should be recorded before commit (where they are written to delta logs). - metrics("executionTimeMs").set((System.nanoTime() - startTime) / 1000 / 1000) - deltaTxn.registerSQLMetrics(spark, metrics) - - // This is a best-effort sanity check. - if ( - metrics("numSourceRowsInSecondScan").value >= 0 && - metrics("numSourceRows").value != metrics("numSourceRowsInSecondScan").value - ) { - log.warn( - s"Merge source has ${metrics("numSourceRows")} rows in initial scan but " + - s"${metrics("numSourceRowsInSecondScan")} rows in second scan") - if (conf.getConf(DeltaSQLConf.MERGE_FAIL_IF_SOURCE_CHANGED)) { - throw DeltaErrors.sourceNotDeterministicInMergeException(spark) - } + val finalActions = createSetTransaction(spark, targetDeltaLog).toSeq ++ deltaActions + // Metrics should be recorded before commit (where they are written to delta logs). + metrics("executionTimeMs").set((System.nanoTime() - startTime) / 1000 / 1000) + deltaTxn.registerSQLMetrics(spark, metrics) + + // This is a best-effort sanity check. 
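A sketch of the fast-path gate applied above, reduced to plain values (names here are illustrative, not the Delta API): the insert-only path is only taken when the statement consists of a single NOT MATCHED clause and the corresponding config is enabled.

    def useInsertOnlyFastPath(
        numMatchedClauses: Int,
        numNotMatchedClauses: Int,
        numNotMatchedBySourceClauses: Int,
        insertOnlyEnabled: Boolean): Boolean =
      numMatchedClauses == 0 &&
        numNotMatchedBySourceClauses == 0 &&
        numNotMatchedClauses == 1 &&
        insertOnlyEnabled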
+ if (metrics("numSourceRowsInSecondScan").value >= 0 && + metrics("numSourceRows").value != metrics("numSourceRowsInSecondScan").value) { + log.warn(s"Merge source has ${metrics("numSourceRows")} rows in initial scan but " + + s"${metrics("numSourceRowsInSecondScan")} rows in second scan") + if (conf.getConf(DeltaSQLConf.MERGE_FAIL_IF_SOURCE_CHANGED)) { + throw DeltaErrors.sourceNotDeterministicInMergeException(spark) } + } - deltaTxn.commit( - deltaActions, - DeltaOperations.Merge( - Option(condition.sql), - matchedClauses.map(DeltaOperations.MergePredicate(_)), - notMatchedClauses.map(DeltaOperations.MergePredicate(_))) - ) - - // Record metrics - var stats = MergeStats.fromMergeSQLMetrics( - metrics, - condition, - matchedClauses, - notMatchedClauses, - deltaTxn.metadata.partitionColumns.nonEmpty) - stats = stats.copy( - materializeSourceReason = Some(materializeSourceReason.toString), - materializeSourceAttempts = Some(attempt)) - - recordDeltaEvent(targetFileIndex.deltaLog, "delta.dml.merge.stats", data = stats) + deltaTxn.commitIfNeeded( + finalActions, + DeltaOperations.Merge( + Option(condition.sql), + matchedClauses.map(DeltaOperations.MergePredicate(_)), + notMatchedClauses.map(DeltaOperations.MergePredicate(_)), + notMatchedBySourceClauses.map(DeltaOperations.MergePredicate(_)))) + + // Record metrics + var stats = MergeStats.fromMergeSQLMetrics( + metrics, + condition, + matchedClauses, + notMatchedClauses, + notMatchedBySourceClauses, + deltaTxn.metadata.partitionColumns.nonEmpty) + stats = stats.copy( + materializeSourceReason = Some(materializeSourceReason.toString), + materializeSourceAttempts = Some(attempt)) + + recordDeltaEvent(targetFileIndex.deltaLog, "delta.dml.merge.stats", data = stats) } spark.sharedState.cacheManager.recacheByPlan(spark, target) } - // This is needed to make the SQL metrics visible in the Spark UI. Also this needs - // to be outside the recordMergeOperation because this method will update some metric. - val executionId = spark.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) - SQLMetrics.postDriverMetricUpdates(spark.sparkContext, executionId, metrics.values.toSeq) - Seq( - Row( - metrics("numTargetRowsUpdated").value + metrics("numTargetRowsDeleted").value + - metrics("numTargetRowsInserted").value, - metrics("numTargetRowsUpdated").value, - metrics("numTargetRowsDeleted").value, - metrics("numTargetRowsInserted").value - )) + sendDriverMetrics(spark, metrics) + Seq(Row(metrics("numTargetRowsUpdated").value + metrics("numTargetRowsDeleted").value + + metrics("numTargetRowsInserted").value, metrics("numTargetRowsUpdated").value, + metrics("numTargetRowsDeleted").value, metrics("numTargetRowsInserted").value)) } /** * Find the target table files that contain the rows that satisfy the merge condition. This is - * implemented as an inner-join between the source query/table and the target table using the - * merge condition. + * implemented as an inner-join between the source query/table and the target table using + * the merge condition. 
*/ private def findTouchedFiles( - spark: SparkSession, - deltaTxn: OptimisticTransaction + spark: SparkSession, + deltaTxn: OptimisticTransaction ): Seq[AddFile] = recordMergeOperation(sqlMetricName = "scanTimeMs") { // Accumulator to collect all the distinct touched files @@ -463,34 +469,40 @@ case class MergeIntoCommand( spark.sparkContext.register(touchedFilesAccum, TOUCHED_FILES_ACCUM_NAME) // UDFs to records touched files names and add them to the accumulator - val recordTouchedFileName = DeltaUDF - .intFromString { - fileName => - fileName.split(",").foreach(name => touchedFilesAccum.add(name)) - 1 + val recordTouchedFileName = DeltaUDF.intFromString { fileName => + // --- modified start + fileName.split(",").foreach(name => touchedFilesAccum.add(name)) + // --- modified end + 1 + }.asNondeterministic() + + // Prune non-matching files if we don't need to collect them for NOT MATCHED BY SOURCE clauses. + val dataSkippedFiles = + if (notMatchedBySourceClauses.isEmpty) { + val targetOnlyPredicates = + splitConjunctivePredicates(condition).filter(_.references.subsetOf(target.outputSet)) + deltaTxn.filterFiles(targetOnlyPredicates) + } else { + deltaTxn.filterFiles() } - .asNondeterministic() - - // Skip data based on the merge condition - val targetOnlyPredicates = - splitConjunctivePredicates(condition).filter(_.references.subsetOf(target.outputSet)) - val dataSkippedFiles = deltaTxn.filterFiles(targetOnlyPredicates) // UDF to increment metrics val incrSourceRowCountExpr = makeMetricUpdateUDF("numSourceRows") val sourceDF = getSourceDF() .filter(new Column(incrSourceRowCountExpr)) - // Apply inner join to between source and target using the merge condition to find matches + // Join the source and target table using the merge condition to find touched files. An inner + // join collects all candidate files for MATCHED clauses, a right outer join also includes + // candidates for NOT MATCHED BY SOURCE clauses. // In addition, we attach two columns // - a monotonically increasing row id for target rows to later identify whether the same // target row is modified by multiple user or not // - the target file name the row is from to later identify the files touched by matched rows - val targetDF = Dataset - .ofRows(spark, buildTargetPlanWithFiles(deltaTxn, dataSkippedFiles)) + val joinType = if (notMatchedBySourceClauses.isEmpty) "inner" else "right_outer" + val targetDF = buildTargetPlanWithFiles(spark, deltaTxn, dataSkippedFiles) .withColumn(ROW_ID_COL, monotonically_increasing_id()) .withColumn(FILE_NAME_COL, input_file_name()) - val joinToFindTouchedFiles = sourceDF.join(targetDF, new Column(condition), "inner") + val joinToFindTouchedFiles = sourceDF.join(targetDF, new Column(condition), joinType) // Process the matches from the inner join to record touched files and find multiple matches val collectTouchedFiles = joinToFindTouchedFiles @@ -542,16 +554,14 @@ case class MergeIntoCommand( logTrace(s"findTouchedFiles: matched files:\n\t${touchedFileNames.mkString("\n\t")}") val nameToAddFileMap = generateCandidateFileMap(targetDeltaLog.dataPath, dataSkippedFiles) - val touchedAddFiles = - touchedFileNames.map(f => getTouchedFile(targetDeltaLog.dataPath, f, nameToAddFileMap)) + val touchedAddFiles = touchedFileNames.map(f => + getTouchedFile(targetDeltaLog.dataPath, f, nameToAddFileMap)) // When the target table is empty, and the optimizer optimized away the join entirely // numSourceRows will be incorrectly 0. We need to scan the source table once to get the correct // metric here. 
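The two decisions introduced above can be summarized in one illustrative function: when NOT MATCHED BY SOURCE clauses exist, unmatched target rows are candidates too, so target-only data skipping is disabled and the touched-files join becomes a right outer join.

    // Returns (join type for the touched-files join, whether target-only data skipping applies).
    def touchedFilesStrategy(hasNotMatchedBySourceClauses: Boolean): (String, Boolean) =
      if (hasNotMatchedBySourceClauses) ("right_outer", false) else ("inner", true)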
- if ( - metrics("numSourceRows").value == 0 && - (dataSkippedFiles.isEmpty || targetDF.take(1).isEmpty) - ) { + if (metrics("numSourceRows").value == 0 && + (dataSkippedFiles.isEmpty || targetDF.take(1).isEmpty)) { val numSourceRows = sourceDF.count() metrics("numSourceRows").set(numSourceRows) } @@ -572,15 +582,15 @@ case class MergeIntoCommand( } /** - * This is an optimization of the case when there is no update clause for the merge. We perform an - * left anti join on the source data to find the rows to be inserted. + * This is an optimization of the case when there is no update clause for the merge. + * We perform an left anti join on the source data to find the rows to be inserted. * * This will currently only optimize for the case when there is a _single_ notMatchedClause. */ private def writeInsertsOnlyWhenNoMatchedClauses( spark: SparkSession, deltaTxn: OptimisticTransaction - ): Seq[FileAction] = recordMergeOperation(sqlMetricName = "rewriteTimeMs") { + ): Seq[FileAction] = recordMergeOperation(sqlMetricName = "rewriteTimeMs") { // UDFs to update metrics val incrSourceRowCountExpr = makeMetricUpdateUDF("numSourceRows") @@ -589,9 +599,8 @@ case class MergeIntoCommand( val outputColNames = getTargetOutputCols(deltaTxn).map(_.name) // we use head here since we know there is only a single notMatchedClause val outputExprs = notMatchedClauses.head.resolvedActions.map(_.expr) - val outputCols = outputExprs.zip(outputColNames).map { - case (expr, name) => - new Column(Alias(expr, name)()) + val outputCols = outputExprs.zip(outputColNames).map { case (expr, name) => + new Column(Alias(expr, name)()) } // source DataFrame @@ -606,10 +615,9 @@ case class MergeIntoCommand( val dataSkippedFiles = deltaTxn.filterFiles(targetOnlyPredicates) // target DataFrame - val targetDF = Dataset.ofRows(spark, buildTargetPlanWithFiles(deltaTxn, dataSkippedFiles)) + val targetDF = buildTargetPlanWithFiles(spark, deltaTxn, dataSkippedFiles) - val insertDf = sourceDF - .join(targetDF, new Column(condition), "leftanti") + val insertDf = sourceDF.join(targetDF, new Column(condition), "leftanti") .select(outputCols: _*) .filter(new Column(incrInsertedCountExpr)) @@ -652,11 +660,11 @@ case class MergeIntoCommand( * CDC_TYPE_COL_NAME used for handling CDC when enabled. */ private def writeAllChanges( - spark: SparkSession, - deltaTxn: OptimisticTransaction, - filesToRewrite: Seq[AddFile] + spark: SparkSession, + deltaTxn: OptimisticTransaction, + filesToRewrite: Seq[AddFile] ): Seq[FileAction] = recordMergeOperation(sqlMetricName = "rewriteTimeMs") { - import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} + import org.apache.spark.sql.catalyst.expressions.Literal.{TrueLiteral, FalseLiteral} val cdcEnabled = DeltaConfigs.CHANGE_DATA_FEED.fromMetaData(deltaTxn.metadata) @@ -685,32 +693,35 @@ case class MergeIntoCommand( // need to drop the duplicate matches. val isDeleteWithDuplicateMatchesAndCdc = multipleMatchDeleteOnlyOvercount.nonEmpty && cdcEnabled - // Generate a new logical plan that has same output attributes exprIds as the target plan. + // Generate a new target dataframe that has same output attributes exprIds as the target plan. // This allows us to apply the existing resolved update/insert expressions. 
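The insert-only path above reduces to a left anti join; a sketch assuming `sourceDF`, `targetDF` and a join condition `cond: Column` are already in scope as in the surrounding code:

    // Source rows with no match in the target are exactly the rows to insert.
    val rowsToInsert = sourceDF.join(targetDF, cond, "leftanti")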
- val newTarget = buildTargetPlanWithFiles(deltaTxn, filesToRewrite) - val joinType = - if ( - isMatchedOnly && - spark.conf.get(DeltaSQLConf.MERGE_MATCHED_ONLY_ENABLED) - ) { - "rightOuter" - } else { - "fullOuter" - } + val baseTargetDF = buildTargetPlanWithFiles(spark, deltaTxn, filesToRewrite) + val joinType = if (hasNoInserts && + spark.conf.get(DeltaSQLConf.MERGE_MATCHED_ONLY_ENABLED)) { + "rightOuter" + } else { + "fullOuter" + } logDebug(s"""writeAllChanges using $joinType join: | source.output: ${source.outputSet} | target.output: ${target.outputSet} | condition: $condition - | newTarget.output: ${newTarget.outputSet} + | newTarget.output: ${baseTargetDF.queryExecution.logical.outputSet} """.stripMargin) // UDFs to update metrics val incrSourceRowCountExpr = makeMetricUpdateUDF("numSourceRowsInSecondScan") val incrUpdatedCountExpr = makeMetricUpdateUDF("numTargetRowsUpdated") + val incrUpdatedMatchedCountExpr = makeMetricUpdateUDF("numTargetRowsMatchedUpdated") + val incrUpdatedNotMatchedBySourceCountExpr = + makeMetricUpdateUDF("numTargetRowsNotMatchedBySourceUpdated") val incrInsertedCountExpr = makeMetricUpdateUDF("numTargetRowsInserted") val incrNoopCountExpr = makeMetricUpdateUDF("numTargetRowsCopied") val incrDeletedCountExpr = makeMetricUpdateUDF("numTargetRowsDeleted") + val incrDeletedMatchedCountExpr = makeMetricUpdateUDF("numTargetRowsMatchedDeleted") + val incrDeletedNotMatchedBySourceCountExpr = + makeMetricUpdateUDF("numTargetRowsNotMatchedBySourceDeleted") // Apply an outer join to find both, matches and non-matches. We are adding two boolean fields // with value `true`, one to each side of the join. Whether this field is null or not after @@ -721,8 +732,7 @@ case class MergeIntoCommand( // insert clause. See above at isDeleteWithDuplicateMatchesAndCdc definition for more details. 
var sourceDF = getSourceDF() .withColumn(SOURCE_ROW_PRESENT_COL, new Column(incrSourceRowCountExpr)) - var targetDF = Dataset - .ofRows(spark, newTarget) + var targetDF = baseTargetDF .withColumn(TARGET_ROW_PRESENT_COL, lit(true)) if (isDeleteWithDuplicateMatchesAndCdc) { targetDF = targetDF.withColumn(TARGET_ROW_ID_COL, monotonically_increasing_id()) @@ -783,12 +793,13 @@ case class MergeIntoCommand( .add(CDC_TYPE_COLUMN_NAME, DataTypes.StringType) } - def updateOutput(resolvedActions: Seq[DeltaMergeAction]): Seq[Seq[Expression]] = { + def updateOutput(resolvedActions: Seq[DeltaMergeAction], incrMetricExpr: Expression) + : Seq[Seq[Expression]] = { val updateExprs = { // Generate update expressions and set ROW_DELETED_COL = false and // CDC_TYPE_COLUMN_NAME = CDC_TYPE_NOT_CDC val mainDataOutput = resolvedActions.map(_.expr) :+ FalseLiteral :+ - incrUpdatedCountExpr :+ CDC_TYPE_NOT_CDC + incrMetricExpr :+ CDC_TYPE_NOT_CDC if (cdcEnabled) { // For update preimage, we have do a no-op copy with ROW_DELETED_COL = false and // CDC_TYPE_COLUMN_NAME = CDC_TYPE_UPDATE_PREIMAGE and INCR_ROW_COUNT_COL as a no-op @@ -808,11 +819,11 @@ case class MergeIntoCommand( updateExprs.map(resolveOnJoinedPlan) } - def deleteOutput(): Seq[Seq[Expression]] = { + def deleteOutput(incrMetricExpr: Expression): Seq[Seq[Expression]] = { val deleteExprs = { // Generate expressions to set the ROW_DELETED_COL = true and CDC_TYPE_COLUMN_NAME = // CDC_TYPE_NOT_CDC - val mainDataOutput = targetOutputCols :+ TrueLiteral :+ incrDeletedCountExpr :+ + val mainDataOutput = targetOutputCols :+ TrueLiteral :+ incrMetricExpr :+ CDC_TYPE_NOT_CDC if (cdcEnabled) { // For delete we do a no-op copy with ROW_DELETED_COL = false, INCR_ROW_COUNT_COL as a @@ -827,7 +838,8 @@ case class MergeIntoCommand( deleteExprs.map(resolveOnJoinedPlan) } - def insertOutput(resolvedActions: Seq[DeltaMergeAction]): Seq[Seq[Expression]] = { + def insertOutput(resolvedActions: Seq[DeltaMergeAction], incrMetricExpr: Expression) + : Seq[Seq[Expression]] = { // Generate insert expressions and set ROW_DELETED_COL = false and // CDC_TYPE_COLUMN_NAME = CDC_TYPE_NOT_CDC val insertExprs = resolvedActions.map(_.expr) @@ -839,9 +851,9 @@ case class MergeIntoCommand( // isDeleteWithDuplicateMatchesAndCdc definition for more details. 
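A self-contained illustration of the presence-column trick used above (frame and column names are placeholders): each side is tagged with a constant TRUE before the outer join, and a NULL tag afterwards reveals which side had no match.

    import org.apache.spark.sql.functions.lit

    // Assuming DataFrames `src` and `tgt` and a join condition `cond` are in scope:
    val joined = src.withColumn("_source_row_present_", lit(true))
      .join(tgt.withColumn("_target_row_present_", lit(true)), cond, "fullOuter")
    // _target_row_present_ IS NULL -> source row with no target match (NOT MATCHED)
    // _source_row_present_ IS NULL -> target row with no source match (NOT MATCHED BY SOURCE)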
insertExprs :+ Alias(Literal(null), TARGET_ROW_ID_COL)() :+ UnresolvedAttribute(SOURCE_ROW_ID_COL) :+ - FalseLiteral :+ incrInsertedCountExpr :+ CDC_TYPE_NOT_CDC + FalseLiteral :+ incrMetricExpr :+ CDC_TYPE_NOT_CDC } else { - insertExprs :+ FalseLiteral :+ incrInsertedCountExpr :+ CDC_TYPE_NOT_CDC + insertExprs :+ FalseLiteral :+ incrMetricExpr :+ CDC_TYPE_NOT_CDC } ) if (cdcEnabled) { @@ -856,9 +868,18 @@ case class MergeIntoCommand( } def clauseOutput(clause: DeltaMergeIntoClause): Seq[Seq[Expression]] = clause match { - case u: DeltaMergeIntoMatchedUpdateClause => updateOutput(u.resolvedActions) - case _: DeltaMergeIntoMatchedDeleteClause => deleteOutput() - case i: DeltaMergeIntoNotMatchedInsertClause => insertOutput(i.resolvedActions) + case u: DeltaMergeIntoMatchedUpdateClause => + updateOutput(u.resolvedActions, And(incrUpdatedCountExpr, incrUpdatedMatchedCountExpr)) + case _: DeltaMergeIntoMatchedDeleteClause => + deleteOutput(And(incrDeletedCountExpr, incrDeletedMatchedCountExpr)) + case i: DeltaMergeIntoNotMatchedInsertClause => + insertOutput(i.resolvedActions, incrInsertedCountExpr) + case u: DeltaMergeIntoNotMatchedBySourceUpdateClause => + updateOutput( + u.resolvedActions, + And(incrUpdatedCountExpr, incrUpdatedNotMatchedBySourceCountExpr)) + case _: DeltaMergeIntoNotMatchedBySourceDeleteClause => + deleteOutput(And(incrDeletedCountExpr, incrDeletedNotMatchedBySourceCountExpr)) } def clauseCondition(clause: DeltaMergeIntoClause): Expression = { @@ -877,15 +898,16 @@ case class MergeIntoCommand( matchedOutputs = matchedClauses.map(clauseOutput), notMatchedConditions = notMatchedClauses.map(clauseCondition), notMatchedOutputs = notMatchedClauses.map(clauseOutput), - noopCopyOutput = resolveOnJoinedPlan( - targetOutputCols :+ FalseLiteral :+ incrNoopCountExpr :+ + notMatchedBySourceConditions = notMatchedBySourceClauses.map(clauseCondition), + notMatchedBySourceOutputs = notMatchedBySourceClauses.map(clauseOutput), + noopCopyOutput = + resolveOnJoinedPlan(targetOutputCols :+ FalseLiteral :+ incrNoopCountExpr :+ CDC_TYPE_NOT_CDC), deleteRowOutput = resolveOnJoinedPlan(targetOutputCols :+ TrueLiteral :+ TrueLiteral :+ CDC_TYPE_NOT_CDC), joinedAttributes = joinedPlan.output, joinedRowEncoder = joinedRowEncoder, - outputRowEncoder = outputRowEncoder - ) + outputRowEncoder = outputRowEncoder) var outputDF = Dataset.ofRows(spark, joinedPlan).mapPartitions(processor.processPartition)(outputRowEncoder) @@ -922,7 +944,7 @@ case class MergeIntoCommand( val (addedBytes, addedPartitions) = totalBytesAndDistinctPartitionValues(newFiles) metrics("numTargetFilesAdded") += newFiles.count(_.isInstanceOf[AddFile]) metrics("numTargetChangeFilesAdded") += newFiles.count(_.isInstanceOf[AddCDCFile]) - metrics("numTargetChangeFileBytes") += newFiles.collect { case f: AddCDCFile => f.size }.sum + metrics("numTargetChangeFileBytes") += newFiles.collect{ case f: AddCDCFile => f.size }.sum metrics("numTargetBytesAdded") += addedBytes metrics("numTargetPartitionsAddedTo") += addedPartitions if (multipleMatchDeleteOnlyOvercount.isDefined) { @@ -931,24 +953,29 @@ case class MergeIntoCommand( metrics("numTargetRowsDeleted").value - multipleMatchDeleteOnlyOvercount.get assert(actualRowsDeleted >= 0) metrics("numTargetRowsDeleted").set(actualRowsDeleted) + val actualRowsMatchedDeleted = + metrics("numTargetRowsMatchedDeleted").value - multipleMatchDeleteOnlyOvercount.get + assert(actualRowsMatchedDeleted >= 0) + metrics("numTargetRowsMatchedDeleted").set(actualRowsMatchedDeleted) } newFiles } + /** - * Build a 
new logical plan using the given `files` that has the same output columns (exprIds) as - * the `target` logical plan, so that existing update/insert expressions can be applied on this - * new plan. + * Build a new logical plan using the given `files` that has the same output columns (exprIds) + * as the `target` logical plan, so that existing update/insert expressions can be applied + * on this new plan. */ private def buildTargetPlanWithFiles( - deltaTxn: OptimisticTransaction, - files: Seq[AddFile]): LogicalPlan = { + spark: SparkSession, + deltaTxn: OptimisticTransaction, + files: Seq[AddFile]): DataFrame = { val targetOutputCols = getTargetOutputCols(deltaTxn) val targetOutputColsMap = { val colsMap: Map[String, NamedExpression] = targetOutputCols.view - .map(col => col.name -> col) - .toMap + .map(col => col.name -> col).toMap if (conf.caseSensitiveAnalysis) { colsMap } else { @@ -968,8 +995,7 @@ case class MergeIntoCommand( // We can ignore the new columns which aren't yet AttributeReferences. targetOutputCols.collect { case a: AttributeReference => a }, catalogTbl, - isStreaming - ) + isStreaming) } // In case of schema evolution & column mapping, we would also need to rebuild the file format @@ -987,14 +1013,11 @@ case class MergeIntoCommand( // create an alias val aliases = plan.output.map { case newAttrib: AttributeReference => - val existingTargetAttrib = targetOutputColsMap - .get(newAttrib.name) + val existingTargetAttrib = targetOutputColsMap.get(newAttrib.name) .getOrElse { throw DeltaErrors.failedFindAttributeInOutputColumns( - newAttrib.name, - targetOutputCols.mkString(",")) - } - .asInstanceOf[AttributeReference] + newAttrib.name, targetOutputCols.mkString(",")) + }.asInstanceOf[AttributeReference] if (existingTargetAttrib.exprId == newAttrib.exprId) { // It's not valid to alias an expression to its own exprId (this is considered a @@ -1005,7 +1028,7 @@ case class MergeIntoCommand( } } - Project(aliases, plan) + Dataset.ofRows(spark, Project(aliases, plan)) } /** Expressions to increment SQL metrics */ @@ -1016,18 +1039,20 @@ case class MergeIntoCommand( } private def getTargetOutputCols(txn: OptimisticTransaction): Seq[NamedExpression] = { - txn.metadata.schema.map { - col => - targetOutputAttributesMap - .get(col.name) - .map(a => AttributeReference(col.name, col.dataType, col.nullable)(a.exprId)) - .getOrElse(Alias(Literal(null), col.name)()) + txn.metadata.schema.map { col => + targetOutputAttributesMap + .get(col.name) + .map { a => + AttributeReference(col.name, col.dataType, col.nullable)(a.exprId) + } + .getOrElse(Alias(Literal(null), col.name)() + ) } } /** - * Repartitions the output DataFrame by the partition columns if table is partitioned and - * `merge.repartitionBeforeWrite.enabled` is set to true. + * Repartitions the output DataFrame by the partition columns if table is partitioned + * and `merge.repartitionBeforeWrite.enabled` is set to true. */ protected def repartitionIfNeeded( spark: SparkSession, @@ -1043,10 +1068,8 @@ case class MergeIntoCommand( /** * Execute the given `thunk` and return its result while recording the time taken to do it. 
* - * @param sqlMetricName - * name of SQL metric to update with the time taken by the thunk - * @param thunk - * the code to execute + * @param sqlMetricName name of SQL metric to update with the time taken by the thunk + * @param thunk the code to execute */ private def recordMergeOperation[A](sqlMetricName: String = null)(thunk: => A): A = { val startTimeNs = System.nanoTime() @@ -1060,7 +1083,6 @@ case class MergeIntoCommand( } object MergeIntoCommand { - /** * Spark UI will track all normal accumulators along with Spark tasks to show them on Web UI. * However, the accumulator used by `MergeIntoCommand` can store a very large value since it @@ -1081,31 +1103,29 @@ object MergeIntoCommand { val INCR_ROW_COUNT_COL = "_incr_row_count_" /** - * @param targetRowHasNoMatch - * whether a joined row is a target row with no match in the source table - * @param sourceRowHasNoMatch - * whether a joined row is a source row with no match in the target table - * @param matchedConditions - * condition for each match clause - * @param matchedOutputs - * corresponding output for each match clause. for each clause, we have 1-3 output rows, each of - * which is a sequence of expressions to apply to the joined row - * @param notMatchedConditions - * condition for each not-matched clause - * @param notMatchedOutputs - * corresponding output for each not-matched clause. for each clause, we have 1-2 output rows, - * each of which is a sequence of expressions to apply to the joined row - * @param noopCopyOutput - * no-op expression to copy a target row to the output - * @param deleteRowOutput - * expression to drop a row from the final output. this is used for source rows that don't match - * any not-matched clauses - * @param joinedAttributes - * schema of our outer-joined dataframe - * @param joinedRowEncoder - * joinedDF row encoder - * @param outputRowEncoder - * final output row encoder + * @param targetRowHasNoMatch whether a joined row is a target row with no match in the source + * table + * @param sourceRowHasNoMatch whether a joined row is a source row with no match in the target + * table + * @param matchedConditions condition for each match clause + * @param matchedOutputs corresponding output for each match clause. for each clause, we + * have 1-3 output rows, each of which is a sequence of expressions + * to apply to the joined row + * @param notMatchedConditions condition for each not-matched clause + * @param notMatchedOutputs corresponding output for each not-matched clause. for each clause, + * we have 1-2 output rows, each of which is a sequence of + * expressions to apply to the joined row + * @param notMatchedBySourceConditions condition for each not-matched-by-source clause + * @param notMatchedBySourceOutputs corresponding output for each not-matched-by-source + * clause. for each clause, we have 1-3 output rows, each of + * which is a sequence of expressions to apply to the joined + * row + * @param noopCopyOutput no-op expression to copy a target row to the output + * @param deleteRowOutput expression to drop a row from the final output. 
this is used for + * source rows that don't match any not-matched clauses + * @param joinedAttributes schema of our outer-joined dataframe + * @param joinedRowEncoder joinedDF row encoder + * @param outputRowEncoder final output row encoder */ class JoinedRowProcessor( targetRowHasNoMatch: Expression, @@ -1114,12 +1134,13 @@ object MergeIntoCommand { matchedOutputs: Seq[Seq[Seq[Expression]]], notMatchedConditions: Seq[Expression], notMatchedOutputs: Seq[Seq[Seq[Expression]]], + notMatchedBySourceConditions: Seq[Expression], + notMatchedBySourceOutputs: Seq[Seq[Seq[Expression]]], noopCopyOutput: Seq[Expression], deleteRowOutput: Seq[Expression], joinedAttributes: Seq[Attribute], joinedRowEncoder: ExpressionEncoder[Row], - outputRowEncoder: ExpressionEncoder[Row]) - extends Serializable { + outputRowEncoder: ExpressionEncoder[Row]) extends Serializable { private def generateProjection(exprs: Seq[Expression]): UnsafeProjection = { UnsafeProjection.create(exprs, joinedAttributes) @@ -1137,6 +1158,8 @@ object MergeIntoCommand { val matchedProjs = matchedOutputs.map(_.map(generateProjection)) val notMatchedPreds = notMatchedConditions.map(generatePredicate) val notMatchedProjs = notMatchedOutputs.map(_.map(generateProjection)) + val notMatchedBySourcePreds = notMatchedBySourceConditions.map(generatePredicate) + val notMatchedBySourceProjs = notMatchedBySourceOutputs.map(_.map(generateProjection)) val noopCopyProj = generateProjection(noopCopyOutput) val deleteRowProj = generateProjection(deleteRowOutput) val outputProj = UnsafeProjection.create(outputRowEncoder.schema) @@ -1145,35 +1168,33 @@ object MergeIntoCommand { // then CDC must be disabled and it's the column after our output cols def shouldDeleteRow(row: InternalRow): Boolean = { row.getBoolean( - outputRowEncoder.schema - .getFieldIndex(ROW_DROPPED_COL) + outputRowEncoder.schema.getFieldIndex(ROW_DROPPED_COL) .getOrElse(outputRowEncoder.schema.fields.size) ) } def processRow(inputRow: InternalRow): Iterator[InternalRow] = { - if (targetRowHasNoMatchPred.eval(inputRow)) { - // Target row did not match any source row, so just copy it to the output - Iterator(noopCopyProj.apply(inputRow)) + // Identify which set of clauses to execute: matched, not-matched or not-matched-by-source + val (predicates, projections, noopAction) = if (targetRowHasNoMatchPred.eval(inputRow)) { + // Target row did not match any source row, so update the target row. 
+ (notMatchedBySourcePreds, notMatchedBySourceProjs, noopCopyProj) + } else if (sourceRowHasNoMatchPred.eval(inputRow)) { + // Source row did not match with any target row, so insert the new source row + (notMatchedPreds, notMatchedProjs, deleteRowProj) } else { - // identify which set of clauses to execute: matched or not-matched ones - val (predicates, projections, noopAction) = if (sourceRowHasNoMatchPred.eval(inputRow)) { - // Source row did not match with any target row, so insert the new source row - (notMatchedPreds, notMatchedProjs, deleteRowProj) - } else { - // Source row matched with target row, so update the target row - (matchedPreds, matchedProjs, noopCopyProj) - } + // Source row matched with target row, so update the target row + (matchedPreds, matchedProjs, noopCopyProj) + } - // find (predicate, projection) pair whose predicate satisfies inputRow - val pair = - (predicates.zip(projections)).find { case (predicate, _) => predicate.eval(inputRow) } + // find (predicate, projection) pair whose predicate satisfies inputRow + val pair = (predicates zip projections).find { + case (predicate, _) => predicate.eval(inputRow) + } - pair match { - case Some((_, projections)) => - projections.map(_.apply(inputRow)).iterator - case None => Iterator(noopAction.apply(inputRow)) - } + pair match { + case Some((_, projections)) => + projections.map(_.apply(inputRow)).iterator + case None => Iterator(noopAction.apply(inputRow)) } } @@ -1183,7 +1204,9 @@ object MergeIntoCommand { .map(toRow) .flatMap(processRow) .filter(!shouldDeleteRow(_)) - .map(notDeletedInternalRow => fromRow(outputProj(notDeletedInternalRow))) + .map { notDeletedInternalRow => + fromRow(outputProj(notDeletedInternalRow)) + } } } diff --git a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala similarity index 51% rename from backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala rename to backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala index e69b3aa65931b..7fa2c97d90066 100644 --- a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala @@ -14,24 +14,31 @@ * See the License for the specific language governing permissions and * limitations under the License. 
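Putting the clause dispatch above together, a self-contained sketch of the per-row logic, with plain Scala functions standing in for the generated Catalyst predicates and projections (all names are illustrative):

    // Pick the clause family for the row, apply the first clause whose condition holds,
    // otherwise fall back to the family's no-op action.
    def dispatchRow[Row](
        targetHasNoMatch: Row => Boolean,
        sourceHasNoMatch: Row => Boolean,
        matched: Seq[(Row => Boolean, Seq[Row => Row])],
        notMatched: Seq[(Row => Boolean, Seq[Row => Row])],
        notMatchedBySource: Seq[(Row => Boolean, Seq[Row => Row])],
        noopCopy: Row => Row,
        deleteRow: Row => Row)(row: Row): Iterator[Row] = {
      val (clauses, noop) =
        if (targetHasNoMatch(row)) (notMatchedBySource, noopCopy)
        else if (sourceHasNoMatch(row)) (notMatched, deleteRow)
        else (matched, noopCopy)
      clauses.find { case (cond, _) => cond(row) } match {
        case Some((_, projections)) => projections.iterator.map(_.apply(row))
        case None => Iterator(noop(row))
      }
    }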
*/ + package org.apache.spark.sql.delta.commands -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext.SPARK_JOB_GROUP_ID -import org.apache.spark.sql.{AnalysisException, Encoders, Row, SparkSession} -import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression} +import java.util.ConcurrentModificationException + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.sql.delta.skipping.MultiDimClustering import org.apache.spark.sql.delta._ import org.apache.spark.sql.delta.DeltaOperations.Operation -import org.apache.spark.sql.delta.actions.{Action, AddFile, FileAction, RemoveFile} +import org.apache.spark.sql.delta.actions.{Action, AddFile, DeletionVectorDescriptor, FileAction, RemoveFile} import org.apache.spark.sql.delta.commands.OptimizeTableCommandOverwrites.{getDeltaLogClickhouse, groupFilesIntoBinsClickhouse, runOptimizeBinJobClickhouse} import org.apache.spark.sql.delta.commands.optimize._ import org.apache.spark.sql.delta.files.SQLMetricsReporting import org.apache.spark.sql.delta.schema.SchemaUtils -import org.apache.spark.sql.delta.skipping.MultiDimClustering import org.apache.spark.sql.delta.sources.DeltaSQLConf + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext.SPARK_JOB_GROUP_ID +import org.apache.spark.sql.{AnalysisException, Encoders, Row, SparkSession} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression} import org.apache.spark.sql.execution.command.{LeafRunnableCommand, RunnableCommand} +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig import org.apache.spark.sql.execution.datasources.v2.clickhouse.metadata.AddMergeTreeParts import org.apache.spark.sql.execution.datasources.v2.clickhouse.utils.CHDataSourceUtils import org.apache.spark.sql.execution.metric.SQLMetric @@ -39,15 +46,13 @@ import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric import org.apache.spark.sql.types._ import org.apache.spark.util.{SystemClock, ThreadUtils} -import java.util.ConcurrentModificationException - -import scala.collection.mutable.ArrayBuffer - /** * Gluten overwrite Delta: * - * This file is copied from Delta 2.2.0. It is modified in: - * 1. getDeltaLogClickhouse 2. runOptimizeBinJobClickhouse 3. groupFilesIntoBinsClickhouse + * This file is copied from Delta 2.3.0. It is modified in: + * 1. getDeltaLogClickhouse + * 2. runOptimizeBinJobClickhouse + * 3. 
groupFilesIntoBinsClickhouse */ /** Base class defining abstract optimize command */ @@ -59,16 +64,12 @@ abstract class OptimizeTableCommandBase extends RunnableCommand with DeltaComman /** * Validates ZOrderBy columns - * - validates that partitions columns are not used in `unresolvedZOrderByCols` - * - validates that we already collect stats for all the columns used in - * `unresolvedZOrderByCols` + * - validates that partitions columns are not used in `unresolvedZOrderByCols` + * - validates that we already collect stats for all the columns used in `unresolvedZOrderByCols` * - * @param spark - * [[SparkSession]] to use - * @param txn - * the [[OptimisticTransaction]] being used to optimize - * @param unresolvedZOrderByCols - * Seq of [[UnresolvedAttribute]] corresponding to zOrderBy columns + * @param spark [[SparkSession]] to use + * @param txn the [[OptimisticTransaction]] being used to optimize + * @param unresolvedZOrderByCols Seq of [[UnresolvedAttribute]] corresponding to zOrderBy columns */ def validateZorderByColumns( spark: SparkSession, @@ -80,32 +81,32 @@ abstract class OptimizeTableCommandBase extends RunnableCommand with DeltaComman val dataSchema = StructType(metadata.schema.filterNot(c => partitionColumns.contains(c.name))) val df = spark.createDataFrame(new java.util.ArrayList[Row](), dataSchema) - val checkColStat = - spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_ZORDER_COL_STAT_CHECK) + val checkColStat = spark.sessionState.conf.getConf( + DeltaSQLConf.DELTA_OPTIMIZE_ZORDER_COL_STAT_CHECK) val statCollectionSchema = txn.snapshot.statCollectionSchema val colsWithoutStats = ArrayBuffer[String]() - unresolvedZOrderByCols.foreach { - colAttribute => - val colName = colAttribute.name - if (checkColStat) { - try { - SchemaUtils.findColumnPosition(colAttribute.nameParts, statCollectionSchema) - } catch { - case e: AnalysisException if e.getMessage.contains("Couldn't find column") => - colsWithoutStats.append(colName) - } - } - val isNameEqual = spark.sessionState.conf.resolver - if (partitionColumns.find(isNameEqual(_, colName)).nonEmpty) { - throw DeltaErrors.zOrderingOnPartitionColumnException(colName) - } - if (df.queryExecution.analyzed.resolve(colAttribute.nameParts, isNameEqual).isEmpty) { - throw DeltaErrors.zOrderingColumnDoesNotExistException(colName) + unresolvedZOrderByCols.foreach { colAttribute => + val colName = colAttribute.name + if (checkColStat) { + try { + SchemaUtils.findColumnPosition(colAttribute.nameParts, statCollectionSchema) + } catch { + case e: AnalysisException if e.getMessage.contains("Couldn't find column") => + colsWithoutStats.append(colName) } + } + val isNameEqual = spark.sessionState.conf.resolver + if (partitionColumns.find(isNameEqual(_, colName)).nonEmpty) { + throw DeltaErrors.zOrderingOnPartitionColumnException(colName) + } + if (df.queryExecution.analyzed.resolve(colAttribute.nameParts, isNameEqual).isEmpty) { + throw DeltaErrors.zOrderingColumnDoesNotExistException(colName) + } } if (checkColStat && colsWithoutStats.nonEmpty) { - throw DeltaErrors.zOrderingOnColumnWithNoStatsException(colsWithoutStats.toSeq, spark) + throw DeltaErrors.zOrderingOnColumnWithNoStatsException( + colsWithoutStats.toSeq, spark) } } } @@ -121,15 +122,15 @@ case class OptimizeTableCommand( tableId: Option[TableIdentifier], userPartitionPredicates: Seq[String], options: Map[String, String])(val zOrderBy: Seq[UnresolvedAttribute]) - extends OptimizeTableCommandBase - with LeafRunnableCommand { + extends OptimizeTableCommandBase with 
LeafRunnableCommand { override val otherCopyArgs: Seq[AnyRef] = zOrderBy :: Nil override def run(sparkSession: SparkSession): Seq[Row] = { + // --- modified start CHDataSourceUtils.ensureClickHouseTableV2(tableId, sparkSession) - val deltaLog = getDeltaLogClickhouse(sparkSession, path, tableId, "OPTIMIZE", options) + // --- modified end val txn = deltaLog.startTransaction() if (txn.readVersion == -1) { @@ -140,10 +141,12 @@ case class OptimizeTableCommand( // Parse the predicate expression into Catalyst expression and verify only simple filters // on partition columns are present - val partitionPredicates = userPartitionPredicates.flatMap { - predicate => + val partitionPredicates = userPartitionPredicates.flatMap { predicate => val predicates = parsePredicates(sparkSession, predicate) - verifyPartitionPredicates(sparkSession, partitionColumns, predicates) + verifyPartitionPredicates( + sparkSession, + partitionColumns, + predicates) predicates } @@ -155,24 +158,19 @@ case class OptimizeTableCommand( } /** - * Optimize job which compacts small files into larger files to reduce the number of files and - * potentially allow more efficient reads. + * Optimize job which compacts small files into larger files to reduce + * the number of files and potentially allow more efficient reads. * - * @param sparkSession - * Spark environment reference. - * @param txn - * The transaction used to optimize this table - * @param partitionPredicate - * List of partition predicates to select subset of files to optimize. + * @param sparkSession Spark environment reference. + * @param txn The transaction used to optimize this table + * @param partitionPredicate List of partition predicates to select subset of files to optimize. */ class OptimizeExecutor( sparkSession: SparkSession, txn: OptimisticTransaction, partitionPredicate: Seq[Expression], zOrderByColumns: Seq[String]) - extends DeltaCommand - with SQLMetricsReporting - with Serializable { + extends DeltaCommand with SQLMetricsReporting with Serializable { /** Timestamp to use in [[FileAction]] */ private val operationTimestamp = new SystemClock().getTimeMillis() @@ -181,63 +179,72 @@ class OptimizeExecutor( def optimize(): Seq[Row] = { recordDeltaOperation(txn.deltaLog, "delta.optimize") { - val minFileSize = - sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MIN_FILE_SIZE) - val maxFileSize = - sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MAX_FILE_SIZE) + // --- modified start + val isMergeTreeFormat = ClickHouseConfig + .isMergeTreeFormatEngine(txn.deltaLog.unsafeVolatileMetadata.configuration) + // --- modified end + val minFileSize = sparkSession.sessionState.conf.getConf( + DeltaSQLConf.DELTA_OPTIMIZE_MIN_FILE_SIZE) + val maxFileSize = sparkSession.sessionState.conf.getConf( + DeltaSQLConf.DELTA_OPTIMIZE_MAX_FILE_SIZE) require(minFileSize > 0, "minFileSize must be > 0") require(maxFileSize > 0, "maxFileSize must be > 0") - val candidateFiles = txn.filterFiles(partitionPredicate) + val candidateFiles = txn.filterFiles(partitionPredicate, keepNumRecords = true) val partitionSchema = txn.metadata.partitionSchema - // select all files in case of multi-dimensional clustering - val filesToProcess = candidateFiles.filter(_.size < minFileSize || isMultiDimClustering) - val partitionsToCompact = filesToProcess - .groupBy(file => (file.asInstanceOf[AddMergeTreeParts].bucketNum, file.partitionValues)) - .toSeq - - val jobs = groupFilesIntoBinsClickhouse(partitionsToCompact, maxFileSize) - + val maxDeletedRowsRatio = 
sparkSession.sessionState.conf.getConf( + DeltaSQLConf.DELTA_OPTIMIZE_MAX_DELETED_ROWS_RATIO) + val filesToProcess = pruneCandidateFileList(minFileSize, maxDeletedRowsRatio, candidateFiles) + // --- modified start val maxThreads = sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MAX_THREADS) - val updates = ThreadUtils - .parmap(jobs, "OptimizeJob", maxThreads) { - partitionBinGroup => - runOptimizeBinJobClickhouse( - txn, - partitionBinGroup._1._2, - partitionBinGroup._1._1, - partitionBinGroup._2, - maxFileSize) - } - .flatten + val (updates, jobs) = if (isMergeTreeFormat) { + val partitionsToCompact = filesToProcess + .groupBy(file => (file.asInstanceOf[AddMergeTreeParts].bucketNum, file.partitionValues)) + .toSeq + val jobs = groupFilesIntoBinsClickhouse(partitionsToCompact, maxFileSize) + (ThreadUtils.parmap(jobs, "OptimizeJob", maxThreads) { partitionBinGroup => + // --- modified start + runOptimizeBinJobClickhouse( + txn, + partitionBinGroup._1._2, + partitionBinGroup._1._1, + partitionBinGroup._2, + maxFileSize) + // --- modified end + }.flatten, jobs) + } else { + val partitionsToCompact = filesToProcess.groupBy(_.partitionValues).toSeq + val jobs = groupFilesIntoBins(partitionsToCompact, maxFileSize) + (ThreadUtils.parmap(jobs, "OptimizeJob", maxThreads) { partitionBinGroup => + runOptimizeBinJob(txn, partitionBinGroup._1, partitionBinGroup._2, maxFileSize) + }.flatten, jobs) + } + // --- modified end val addedFiles = updates.collect { case a: AddFile => a } val removedFiles = updates.collect { case r: RemoveFile => r } + val removedDVs = filesToProcess.filter(_.deletionVector != null).map(_.deletionVector).toSeq if (addedFiles.size > 0) { val operation = DeltaOperations.Optimize(partitionPredicate.map(_.sql), zOrderByColumns) - val metrics = createMetrics(sparkSession.sparkContext, addedFiles, removedFiles) - commitAndRetry(txn, operation, updates, metrics) { - newTxn => - val newPartitionSchema = newTxn.metadata.partitionSchema - val candidateSetOld = candidateFiles.map(_.path).toSet - val candidateSetNew = newTxn.filterFiles(partitionPredicate).map(_.path).toSet - - // As long as all of the files that we compacted are still part of the table, - // and the partitioning has not changed it is valid to continue to try - // and commit this checkpoint. - if ( - candidateSetOld.subsetOf(candidateSetNew) && partitionSchema == newPartitionSchema - ) { - true - } else { - val deleted = candidateSetOld -- candidateSetNew - logWarning( - s"The following compacted files were delete " + - s"during checkpoint ${deleted.mkString(",")}. Aborting the compaction.") - false - } + val metrics = createMetrics(sparkSession.sparkContext, addedFiles, removedFiles, removedDVs) + commitAndRetry(txn, operation, updates, metrics) { newTxn => + val newPartitionSchema = newTxn.metadata.partitionSchema + val candidateSetOld = candidateFiles.map(_.path).toSet + val candidateSetNew = newTxn.filterFiles(partitionPredicate).map(_.path).toSet + + // As long as all of the files that we compacted are still part of the table, + // and the partitioning has not changed it is valid to continue to try + // and commit this checkpoint. + if (candidateSetOld.subsetOf(candidateSetNew) && partitionSchema == newPartitionSchema) { + true + } else { + val deleted = candidateSetOld -- candidateSetNew + logWarning(s"The following compacted files were delete " + + s"during checkpoint ${deleted.mkString(",")}. 
Aborting the compaction.") + false + } } } @@ -249,37 +256,65 @@ class OptimizeExecutor( optimizeStats.totalConsideredFiles = candidateFiles.size optimizeStats.totalFilesSkipped = optimizeStats.totalConsideredFiles - removedFiles.size optimizeStats.totalClusterParallelism = sparkSession.sparkContext.defaultParallelism + val numTableColumns = txn.snapshot.metadata.schema.size + optimizeStats.numTableColumns = numTableColumns + optimizeStats.numTableColumnsWithStats = + DeltaConfigs.DATA_SKIPPING_NUM_INDEXED_COLS.fromMetaData(txn.snapshot.metadata) + .min(numTableColumns) + if (removedDVs.size > 0) { + optimizeStats.deletionVectorStats = Some(DeletionVectorStats( + numDeletionVectorsRemoved = removedDVs.size, + numDeletionVectorRowsRemoved = removedDVs.map(_.cardinality).sum)) + } if (isMultiDimClustering) { val inputFileStats = ZOrderFileStats(removedFiles.size, removedFiles.map(_.size.getOrElse(0L)).sum) - optimizeStats.zOrderStats = Some( - ZOrderStats( - strategyName = "all", // means process all files in a partition - inputCubeFiles = ZOrderFileStats(0, 0), - inputOtherFiles = inputFileStats, - inputNumCubes = 0, - mergedFiles = inputFileStats, - // There will one z-cube for each partition - numOutputCubes = optimizeStats.numPartitionsOptimized - )) + optimizeStats.zOrderStats = Some(ZOrderStats( + strategyName = "all", // means process all files in a partition + inputCubeFiles = ZOrderFileStats(0, 0), + inputOtherFiles = inputFileStats, + inputNumCubes = 0, + mergedFiles = inputFileStats, + // There will one z-cube for each partition + numOutputCubes = optimizeStats.numPartitionsOptimized)) } return Seq(Row(txn.deltaLog.dataPath.toString, optimizeStats.toOptimizeMetrics)) } } + /** + * Helper method to prune the list of selected files based on fileSize and ratio of + * deleted rows according to the deletion vector in [[AddFile]]. + */ + private def pruneCandidateFileList( + minFileSize: Long, maxDeletedRowsRatio: Double, files: Seq[AddFile]): Seq[AddFile] = { + + // Select all files in case of multi-dimensional clustering + if (isMultiDimClustering) return files + + def shouldCompactBecauseOfDeletedRows(file: AddFile): Boolean = { + // Always compact files with DVs but without numRecords stats. + // This may be overly aggressive, but it fixes the problem in the long-term, + // as the compacted files will have stats. + (file.deletionVector != null && file.numPhysicalRecords.isEmpty) || + file.deletedToPhysicalRecordsRatio.getOrElse(0d) > maxDeletedRowsRatio + } + + // Select files that are small or have too many deleted rows + files.filter( + addFile => addFile.size < minFileSize || shouldCompactBecauseOfDeletedRows(addFile)) + } + /** * Utility methods to group files into bins for optimize. * - * @param partitionsToCompact - * List of files to compact group by partition. Partition is defined by the partition values - * (partCol -> partValue) - * @param maxTargetFileSize - * Max size (in bytes) of the compaction output file. - * @return - * Sequence of bins. Each bin contains one or more files from the same partition and targeted - * for one output file. + * @param partitionsToCompact List of files to compact group by partition. + * Partition is defined by the partition values (partCol -> partValue) + * @param maxTargetFileSize Max size (in bytes) of the compaction output file. + * @return Sequence of bins. Each bin contains one or more files from the same + * partition and targeted for one output file. 
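The pruning rule above keeps files that are small or carry enough soft-deleted rows (tracked by deletion vectors) to be worth rewriting; a simplified, self-contained sketch over a hypothetical FileInfo type:

    final case class FileInfo(
        size: Long,
        hasDeletionVector: Boolean,
        numPhysicalRecords: Option[Long],
        deletedToPhysicalRatio: Option[Double])

    def shouldCompact(f: FileInfo, minFileSize: Long, maxDeletedRowsRatio: Double): Boolean =
      f.size < minFileSize ||
        (f.hasDeletionVector && f.numPhysicalRecords.isEmpty) || // DV present but no row-count stats
        f.deletedToPhysicalRatio.getOrElse(0d) > maxDeletedRowsRatio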
*/ private def groupFilesIntoBins( partitionsToCompact: Seq[(Map[String, String], Seq[AddFile])], @@ -291,47 +326,42 @@ class OptimizeExecutor( val currentBin = new ArrayBuffer[AddFile]() var currentBinSize = 0L - files.sortBy(_.size).foreach { - file => - // Generally, a bin is a group of existing files, whose total size does not exceed the - // desired maxFileSize. They will be coalesced into a single output file. - // However, if isMultiDimClustering = true, all files in a partition will be read by the - // same job, the data will be range-partitioned and - // umFiles = totalFileSize / maxFileSize will be produced. See below. - if (file.size + currentBinSize > maxTargetFileSize && !isMultiDimClustering) { - bins += currentBin.toVector - currentBin.clear() - currentBin += file - currentBinSize = file.size - } else { - currentBin += file - currentBinSize += file.size - } + files.sortBy(_.size).foreach { file => + // Generally, a bin is a group of existing files, whose total size does not exceed the + // desired maxFileSize. They will be coalesced into a single output file. + // However, if isMultiDimClustering = true, all files in a partition will be read by the + // same job, the data will be range-partitioned and numFiles = totalFileSize / maxFileSize + // will be produced. See below. + if (file.size + currentBinSize > maxTargetFileSize && !isMultiDimClustering) { + bins += currentBin.toVector + currentBin.clear() + currentBin += file + currentBinSize = file.size + } else { + currentBin += file + currentBinSize += file.size + } } if (currentBin.nonEmpty) { bins += currentBin.toVector } - bins - .map(b => (partition, b)) - // select bins that have at least two files or in case of multi-dim clustering - // select all bins - .filter(_._2.size > 1 || isMultiDimClustering) + bins.filter { bin => + bin.size > 1 || // bin has more than one file or + (bin.size == 1 && bin(0).deletionVector != null) || // single file in the bin has a DV or + isMultiDimClustering // multi-clustering + }.map(b => (partition, b)) } } /** * Utility method to run a Spark job to compact the files in given bin * - * @param txn - * [[OptimisticTransaction]] instance in use to commit the changes to DeltaLog. - * @param partition - * Partition values of the partition that files in [[bin]] belongs to. - * @param bin - * List of files to compact into one large file. - * @param maxFileSize - * Targeted output file size in bytes + * @param txn [[OptimisticTransaction]] instance in use to commit the changes to DeltaLog. + * @param partition Partition values of the partition that files in [[bin]] belongs to. + * @param bin List of files to compact into one large file. 
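The grouping above is a greedy bin packing over file sizes; a self-contained sketch over plain longs, leaving out the multi-dimensional-clustering and deletion-vector special cases:

    import scala.collection.mutable.ArrayBuffer

    def binPack(sizes: Seq[Long], maxBinSize: Long): Seq[Vector[Long]] = {
      val bins = ArrayBuffer.empty[Vector[Long]]
      var current = Vector.empty[Long]
      var currentSize = 0L
      sizes.sorted.foreach { s =>
        if (current.nonEmpty && currentSize + s > maxBinSize) {
          bins += current; current = Vector(s); currentSize = s
        } else {
          current :+= s; currentSize += s
        }
      }
      if (current.nonEmpty) bins += current
      bins.toSeq
    }

    // binPack(Seq(10L, 200L, 50L, 300L), maxBinSize = 512L)
    //   -> Seq(Vector(10, 50, 200), Vector(300)); only bins with more than one file
    //      (or a file carrying a deletion vector) are worth compacting.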
+ * @param maxFileSize Targeted output file size in bytes */ private def runOptimizeBinJob( txn: OptimisticTransaction, @@ -344,10 +374,13 @@ class OptimizeExecutor( val repartitionDF = if (isMultiDimClustering) { val totalSize = bin.map(_.size).sum val approxNumFiles = Math.max(1, totalSize / maxFileSize).toInt - MultiDimClustering.cluster(input, approxNumFiles, zOrderByColumns) + MultiDimClustering.cluster( + input, + approxNumFiles, + zOrderByColumns) } else { - val useRepartition = - sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_REPARTITION_ENABLED) + val useRepartition = sparkSession.sessionState.conf.getConf( + DeltaSQLConf.DELTA_OPTIMIZE_REPARTITION_ENABLED) if (useRepartition) { input.repartition(numPartitions = 1) } else { @@ -369,7 +402,7 @@ class OptimizeExecutor( case other => throw new IllegalStateException( s"Unexpected action $other with type ${other.getClass}. File compaction job output" + - s"should only have AddFiles") + s"should only have AddFiles") } val removeFiles = bin.map(f => f.removeWithTimestamp(operationTimestamp, dataChange = false)) val updates = addFiles ++ removeFiles @@ -377,9 +410,9 @@ class OptimizeExecutor( } /** - * Attempts to commit the given actions to the log. In the case of a concurrent update, the given - * function will be invoked with a new transaction to allow custom conflict detection logic to - * indicate it is safe to try again, by returning `true`. + * Attempts to commit the given actions to the log. In the case of a concurrent update, + * the given function will be invoked with a new transaction to allow custom conflict + * detection logic to indicate it is safe to try again, by returning `true`. * * This function will continue to try to commit to the log as long as `f` returns `true`, * otherwise throws a subclass of [[ConcurrentModificationException]]. 
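A generic sketch of the retry contract described above (not the Delta implementation): keep retrying the commit as long as the caller-supplied check says the previously selected files are still intact, otherwise abort with a ConcurrentModificationException.

    def commitWithRetry(tryCommit: () => Boolean, stillSafeToRetry: () => Boolean): Unit = {
      while (!tryCommit()) {
        if (!stillSafeToRetry())
          throw new java.util.ConcurrentModificationException(
            "A conflicting commit invalidated this optimize job")
      }
    }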
@@ -409,7 +442,8 @@ class OptimizeExecutor( private def createMetrics( sparkContext: SparkContext, addedFiles: Seq[AddFile], - removedFiles: Seq[RemoveFile]): Map[String, SQLMetric] = { + removedFiles: Seq[RemoveFile], + removedDVs: Seq[DeletionVectorDescriptor]): Map[String, SQLMetric] = { def setAndReturnMetric(description: String, value: Long) = { val metric = createMetric(sparkContext, description) @@ -419,19 +453,37 @@ class OptimizeExecutor( def totalSize(actions: Seq[FileAction]): Long = { var totalSize = 0L - actions.foreach { - file => - val fileSize = file match { - case addFile: AddFile => addFile.size - case removeFile: RemoveFile => removeFile.size.getOrElse(0L) - case default => - throw new IllegalArgumentException(s"Unknown FileAction type: ${default.getClass}") - } - totalSize += fileSize + actions.foreach { file => + val fileSize = file match { + case addFile: AddFile => addFile.size + case removeFile: RemoveFile => removeFile.size.getOrElse(0L) + case default => + throw new IllegalArgumentException(s"Unknown FileAction type: ${default.getClass}") + } + totalSize += fileSize } totalSize } + val (deletionVectorRowsRemoved, deletionVectorBytesRemoved) = + removedDVs.map(dv => (dv.cardinality, dv.sizeInBytes.toLong)) + .reduceLeftOption((dv1, dv2) => (dv1._1 + dv2._1, dv1._2 + dv2._2)) + .getOrElse((0L, 0L)) + + val dvMetrics: Map[String, SQLMetric] = Map( + "numDeletionVectorsRemoved" -> + setAndReturnMetric( + "total number of deletion vectors removed", + removedDVs.size), + "numDeletionVectorRowsRemoved" -> + setAndReturnMetric( + "total number of deletion vector rows removed", + deletionVectorRowsRemoved), + "numDeletionVectorBytesRemoved" -> + setAndReturnMetric( + "total number of bytes of removed deletion vectors", + deletionVectorBytesRemoved)) + val sizeStats = FileSizeStatsWithHistogram.create(addedFiles.map(_.size).sorted) Map[String, SQLMetric]( "minFileSize" -> setAndReturnMetric("minimum file size", sizeStats.get.min), @@ -444,6 +496,6 @@ class OptimizeExecutor( "numAddedBytes" -> setAndReturnMetric("total number of bytes added", totalSize(addedFiles)), "numRemovedBytes" -> setAndReturnMetric("total number of bytes removed", totalSize(removedFiles)) - ) + ) ++ dvMetrics } } diff --git a/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/OptimizeTableCommandOverwrites.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/OptimizeTableCommandOverwrites.scala new file mode 100644 index 0000000000000..f7a180b6a239c --- /dev/null +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/OptimizeTableCommandOverwrites.scala @@ -0,0 +1,333 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
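The deletion-vector metric aggregation above is a pairwise sum with a (0, 0) default; in isolation, with hypothetical values:

    // Each entry is (cardinality, sizeInBytes) of one removed deletion vector.
    val removedDVs: Seq[(Long, Long)] = Seq((10L, 128L), (5L, 64L))
    val (dvRowsRemoved, dvBytesRemoved) =
      removedDVs.reduceLeftOption((a, b) => (a._1 + b._1, a._2 + b._2)).getOrElse((0L, 0L))
    // dvRowsRemoved == 15L, dvBytesRemoved == 192L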
+ */ +package org.apache.spark.sql.delta.commands + +import org.apache.gluten.expression.ConverterUtils + +import org.apache.spark.{TaskContext, TaskOutputFileAlreadyExistException} +import org.apache.spark.internal.Logging +import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage +import org.apache.spark.internal.io.SparkHadoopWriterUtils +import org.apache.spark.shuffle.FetchFailedException +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} +import org.apache.spark.sql.catalyst.catalog.CatalogTableType +import org.apache.spark.sql.delta._ +import org.apache.spark.sql.delta.actions.{AddFile, FileAction} +import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 +import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.execution.datasources.CHDatasourceJniWrapper +import org.apache.spark.sql.execution.datasources.v1.CHMergeTreeWriterInjects +import org.apache.spark.sql.execution.datasources.v1.clickhouse._ +import org.apache.spark.sql.execution.datasources.v2.clickhouse.metadata.{AddFileTags, AddMergeTreeParts} +import org.apache.spark.sql.execution.datasources.v2.clickhouse.utils.CHDataSourceUtils +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.{SerializableConfiguration, SystemClock, Utils} + +import org.apache.hadoop.fs.{FileAlreadyExistsException, Path} +import org.apache.hadoop.mapreduce.{TaskAttemptContext, TaskAttemptID, TaskID, TaskType} +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl + +import java.util.{Date, UUID} + +import scala.collection.mutable.ArrayBuffer + +object OptimizeTableCommandOverwrites extends Logging { + + case class TaskDescription( + path: String, + database: String, + tableName: String, + snapshotId: String, + orderByKeyOption: Option[Seq[String]], + lowCardKeyOption: Option[Seq[String]], + minmaxIndexKeyOption: Option[Seq[String]], + bfIndexKeyOption: Option[Seq[String]], + setIndexKeyOption: Option[Seq[String]], + primaryKeyOption: Option[Seq[String]], + partitionColumns: Seq[String], + partList: Seq[String], + tableSchema: StructType, + clickhouseTableConfigs: Map[String, String], + serializableHadoopConf: SerializableConfiguration, + jobIdInstant: Long, + partitionDir: Option[String], + bucketDir: Option[String] + ) + + private def executeTask( + description: TaskDescription, + sparkStageId: Int, + sparkPartitionId: Int, + sparkAttemptNumber: Int + ): MergeTreeWriteTaskResult = { + + val jobId = SparkHadoopWriterUtils.createJobID(new Date(description.jobIdInstant), sparkStageId) + val taskId = new TaskID(jobId, TaskType.MAP, sparkPartitionId) + val taskAttemptId = new TaskAttemptID(taskId, sparkAttemptNumber) + + // Set up the attempt context required to use in the output committer. 
+ val taskAttemptContext: TaskAttemptContext = { + // Set up the configuration object + val hadoopConf = description.serializableHadoopConf.value + hadoopConf.set("mapreduce.job.id", jobId.toString) + hadoopConf.set("mapreduce.task.id", taskAttemptId.getTaskID.toString) + hadoopConf.set("mapreduce.task.attempt.id", taskAttemptId.toString) + hadoopConf.setBoolean("mapreduce.task.ismap", true) + hadoopConf.setInt("mapreduce.task.partition", 0) + + new TaskAttemptContextImpl(hadoopConf, taskAttemptId) + } + + try { + Utils.tryWithSafeFinallyAndFailureCallbacks(block = { + + val uuid = UUID.randomUUID.toString + + val planWithSplitInfo = CHMergeTreeWriterInjects.genMergeTreeWriteRel( + description.path, + description.database, + description.tableName, + description.snapshotId, + description.orderByKeyOption, + description.lowCardKeyOption, + description.minmaxIndexKeyOption, + description.bfIndexKeyOption, + description.setIndexKeyOption, + description.primaryKeyOption, + description.partitionColumns, + description.partList, + ConverterUtils.convertNamedStructJson(description.tableSchema), + description.clickhouseTableConfigs, + description.tableSchema.toAttributes + ) + + val datasourceJniWrapper = new CHDatasourceJniWrapper() + val returnedMetrics = + datasourceJniWrapper.nativeMergeMTParts( + planWithSplitInfo.plan, + planWithSplitInfo.splitInfo, + uuid, + taskId.getId.toString, + description.partitionDir.getOrElse(""), + description.bucketDir.getOrElse("") + ) + if (returnedMetrics != null && returnedMetrics.nonEmpty) { + val addFiles = AddFileTags.partsMetricsToAddFile( + description.database, + description.tableName, + description.path, + returnedMetrics, + Seq(Utils.localHostName())) + + val (taskCommitMessage, taskCommitTime) = Utils.timeTakenMs { + // committer.commitTask(taskAttemptContext) + new TaskCommitMessage(addFiles.toSeq) + } + +// val summary = MergeTreeExecutedWriteSummary( +// updatedPartitions = updatedPartitions.toSet, +// stats = statsTrackers.map(_.getFinalStats(taskCommitTime))) + MergeTreeWriteTaskResult(taskCommitMessage, null) + } else { + throw new IllegalStateException() + } + })( + catchBlock = { + // If there is an error, abort the task + logError(s"Job $jobId aborted.") + }, + finallyBlock = {}) + } catch { + case e: FetchFailedException => + throw e + case f: FileAlreadyExistsException if SQLConf.get.fastFailFileFormatOutput => + // If any output file to write already exists, it does not make sense to re-run this task. + // We throw the exception and let Executor throw ExceptionFailure to abort the job. 
+ throw new TaskOutputFileAlreadyExistException(f) + case t: Throwable => + throw QueryExecutionErrors.taskFailedWhileWritingRowsError(t) + } + + } + + def runOptimizeBinJobClickhouse( + txn: OptimisticTransaction, + partitionValues: Map[String, String], + bucketNum: String, + bin: Seq[AddFile], + maxFileSize: Long): Seq[FileAction] = { + val tableV2 = ClickHouseTableV2.getTable(txn.deltaLog); + + val sparkSession = SparkSession.getActiveSession.get + + val rddWithNonEmptyPartitions = + sparkSession.sparkContext.parallelize(Array.empty[InternalRow], 1) + + val jobIdInstant = new Date().getTime + val ret = new Array[MergeTreeWriteTaskResult](rddWithNonEmptyPartitions.partitions.length) + + val serializableHadoopConf = new SerializableConfiguration( + sparkSession.sessionState.newHadoopConfWithOptions( + txn.metadata.configuration ++ txn.deltaLog.options)) + + val partitionDir = if (tableV2.partitionColumns.isEmpty) { + None + } else { + Some(tableV2.partitionColumns.map(c => c + "=" + partitionValues(c)).mkString("/")) + } + + val bucketDir = if (tableV2.bucketOption.isEmpty) { + None + } else { + Some(bucketNum) + } + + val description = TaskDescription.apply( + txn.deltaLog.dataPath.toString, + tableV2.dataBaseName, + tableV2.tableName, + ClickhouseSnapshot.genSnapshotId(tableV2.snapshot), + tableV2.orderByKeyOption, + tableV2.lowCardKeyOption, + tableV2.minmaxIndexKeyOption, + tableV2.bfIndexKeyOption, + tableV2.setIndexKeyOption, + tableV2.primaryKeyOption, + tableV2.partitionColumns, + bin.map(_.asInstanceOf[AddMergeTreeParts].name), + tableV2.schema(), + tableV2.clickhouseTableConfigs, + serializableHadoopConf, + jobIdInstant, + partitionDir, + bucketDir + ) + sparkSession.sparkContext.runJob( + rddWithNonEmptyPartitions, + (taskContext: TaskContext, _: Iterator[InternalRow]) => { + executeTask( + description, + taskContext.stageId(), + taskContext.partitionId(), + taskContext.taskAttemptId().toInt & Integer.MAX_VALUE + ) + }, + rddWithNonEmptyPartitions.partitions.indices, + (index, res: MergeTreeWriteTaskResult) => { + ret(index) = res + } + ) + + val addFiles = ret + .flatMap(_.commitMsg.obj.asInstanceOf[Seq[AddFile]]) + .toSeq + + val removeFiles = + bin.map(f => f.removeWithTimestamp(new SystemClock().getTimeMillis(), dataChange = false)) + addFiles ++ removeFiles + + } + + def getDeltaLogClickhouse( + spark: SparkSession, + path: Option[String], + tableIdentifier: Option[TableIdentifier], + operationName: String, + hadoopConf: Map[String, String] = Map.empty): DeltaLog = { + val tablePath = + if (path.nonEmpty) { + new Path(path.get) + } else if (tableIdentifier.nonEmpty) { + val sessionCatalog = spark.sessionState.catalog + lazy val metadata = sessionCatalog.getTableMetadata(tableIdentifier.get) + + if (CHDataSourceUtils.isClickhousePath(spark, tableIdentifier.get)) { + new Path(tableIdentifier.get.table) + } else if (CHDataSourceUtils.isClickHouseTable(spark, tableIdentifier.get)) { + new Path(metadata.location) + } else { + DeltaTableIdentifier(spark, tableIdentifier.get) match { + case Some(id) if id.path.nonEmpty => + new Path(id.path.get) + case Some(id) if id.table.nonEmpty => + new Path(metadata.location) + case _ => + if (metadata.tableType == CatalogTableType.VIEW) { + throw DeltaErrors.viewNotSupported(operationName) + } + throw DeltaErrors.notADeltaTableException(operationName) + } + } + } else { + throw DeltaErrors.missingTableIdentifierException(operationName) + } + + val startTime = Some(System.currentTimeMillis) + val deltaLog = DeltaLog.forTable(spark, 
tablePath, hadoopConf) + if (deltaLog.update(checkIfUpdatedSinceTs = startTime).version < 0) { + throw DeltaErrors.notADeltaTableException( + operationName, + DeltaTableIdentifier(path, tableIdentifier)) + } + deltaLog + } + + def groupFilesIntoBinsClickhouse( + partitionsToCompact: Seq[((String, Map[String, String]), Seq[AddFile])], + maxTargetFileSize: Long): Seq[((String, Map[String, String]), Seq[AddFile])] = { + partitionsToCompact.flatMap { + case (partition, files) => + val bins = new ArrayBuffer[Seq[AddFile]]() + + val currentBin = new ArrayBuffer[AddFile]() + var currentBinSize = 0L + + files.sortBy(_.size).foreach { + file => + // Generally, a bin is a group of existing files, whose total size does not exceed the + // desired maxFileSize. They will be coalesced into a single output file. + // However, if isMultiDimClustering = true, all files in a partition will be read by the + // same job, the data will be range-partitioned and + // numFiles = totalFileSize / maxFileSize + // will be produced. See below. + + // isMultiDimClustering is always false for Gluten Clickhouse for now + if (file.size + currentBinSize > maxTargetFileSize /* && !isMultiDimClustering */ ) { + bins += currentBin.toVector + currentBin.clear() + currentBin += file + currentBinSize = file.size + } else { + currentBin += file + currentBinSize += file.size + } + } + + if (currentBin.nonEmpty) { + bins += currentBin.toVector + } + + bins + .map(b => (partition, b)) + // select bins that have at least two files or in case of multi-dim clustering + // select all bins + .filter(_._2.size > 1 /* || isMultiDimClustering */ ) + } + } +} diff --git a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/UpdateCommand.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/UpdateCommand.scala similarity index 66% rename from backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/UpdateCommand.scala rename to backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/UpdateCommand.scala index ad118470fc7f6..94ccef961c08a 100644 --- a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/commands/UpdateCommand.scala +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/UpdateCommand.scala @@ -14,32 +14,32 @@ * See the License for the specific language governing permissions and * limitations under the License. 
 */
+
 package org.apache.spark.sql.delta.commands
+// scalastyle:off import.ordering.noEmptyLine
+import org.apache.spark.sql.delta.{DeltaConfigs, DeltaLog, DeltaOperations, DeltaTableUtils, DeltaUDF, OptimisticTransaction}
+import org.apache.spark.sql.delta.actions.{AddCDCFile, AddFile, FileAction}
+import org.apache.spark.sql.delta.commands.cdc.CDCReader.{CDC_TYPE_COLUMN_NAME, CDC_TYPE_NOT_CDC, CDC_TYPE_UPDATE_POSTIMAGE, CDC_TYPE_UPDATE_PREIMAGE}
+import org.apache.spark.sql.delta.files.{TahoeBatchFileIndex, TahoeFileIndex}
+import org.apache.hadoop.fs.Path
+
 import org.apache.spark.SparkContext
-import org.apache.spark.sql._
+import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession}
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, If, Literal}
 import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.delta._
-import org.apache.spark.sql.delta.actions.{AddCDCFile, AddFile, FileAction}
-import org.apache.spark.sql.delta.commands.cdc.CDCReader.{CDC_TYPE_COLUMN_NAME, CDC_TYPE_NOT_CDC, CDC_TYPE_UPDATE_POSTIMAGE, CDC_TYPE_UPDATE_PREIMAGE}
-import org.apache.spark.sql.delta.files.{TahoeBatchFileIndex, TahoeFileIndex}
-import org.apache.spark.sql.execution.SQLExecution
 import org.apache.spark.sql.execution.command.LeafRunnableCommand
-import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
+import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.metric.SQLMetrics.{createMetric, createTimingMetric}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.LongType
-// scalastyle:off import.ordering.noEmptyLine
-import org.apache.hadoop.fs.Path
-
 /**
 * Gluten overwrite Delta:
 *
- * This file is copied from Delta 2.2.0. It is modified to overcome the following issues:
+ * This file is copied from Delta 2.3.0. It is modified to overcome the following issues:
 * 1. In Clickhouse backend, we can't implement input_file_name() correctly, we can only implement
 * it so that it returns a list of filenames (concatenated by ',').
 */
@@ -47,18 +47,18 @@ import org.apache.hadoop.fs.Path
 /**
 * Performs an Update using `updateExpression` on the rows that match `condition`
 *
- * Algorithm: 1) Identify the affected files, i.e., the files that may have the rows to be updated.
- * 2) Scan affected files, apply the updates, and generate a new DF with updated rows. 3) Use the
- * Delta protocol to atomically write the new DF as new files and remove the affected files that are
- * identified in step 1.
+ * Algorithm:
+ * 1) Identify the affected files, i.e., the files that may have the rows to be updated.
+ * 2) Scan affected files, apply the updates, and generate a new DF with updated rows.
+ * 3) Use the Delta protocol to atomically write the new DF as new files and remove
+ * the affected files that are identified in step 1.
*/ case class UpdateCommand( tahoeFileIndex: TahoeFileIndex, target: LogicalPlan, updateExpressions: Seq[Expression], condition: Option[Expression]) - extends LeafRunnableCommand - with DeltaCommand { + extends LeafRunnableCommand with DeltaCommand { override val output: Seq[Attribute] = { Seq(AttributeReference("num_affected_rows", LongType)()) @@ -70,7 +70,9 @@ case class UpdateCommand( override lazy val metrics = Map[String, SQLMetric]( "numAddedFiles" -> createMetric(sc, "number of files added."), + "numAddedBytes" -> createMetric(sc, "number of bytes added"), "numRemovedFiles" -> createMetric(sc, "number of files removed."), + "numRemovedBytes" -> createMetric(sc, "number of bytes removed"), "numUpdatedRows" -> createMetric(sc, "number of rows updated."), "numCopiedRows" -> createMetric(sc, "number of rows copied."), "executionTimeMs" -> @@ -87,8 +89,14 @@ case class UpdateCommand( final override def run(sparkSession: SparkSession): Seq[Row] = { recordDeltaOperation(tahoeFileIndex.deltaLog, "delta.dml.update") { val deltaLog = tahoeFileIndex.deltaLog - deltaLog.assertRemovable() - deltaLog.withNewTransaction(txn => performUpdate(sparkSession, deltaLog, txn)) + deltaLog.withNewTransaction { txn => + DeltaLog.assertRemovable(txn.snapshot) + if (hasBeenExecuted(txn, sparkSession)) { + sendDriverMetrics(sparkSession, metrics) + return Seq.empty + } + performUpdate(sparkSession, deltaLog, txn) + } // Re-cache all cached plans(including this relation itself, if it's cached) that refer to // this data source relation. sparkSession.sharedState.cacheManager.recacheByPlan(sparkSession, target) @@ -97,13 +105,13 @@ case class UpdateCommand( } private def performUpdate( - sparkSession: SparkSession, - deltaLog: DeltaLog, - txn: OptimisticTransaction): Unit = { + sparkSession: SparkSession, deltaLog: DeltaLog, txn: OptimisticTransaction): Unit = { import org.apache.spark.sql.delta.implicits._ var numTouchedFiles: Long = 0 var numRewrittenFiles: Long = 0 + var numAddedBytes: Long = 0 + var numRemovedBytes: Long = 0 var numAddedChangeFiles: Long = 0 var changeFileBytes: Long = 0 var scanTimeMs: Long = 0 @@ -115,9 +123,7 @@ case class UpdateCommand( val updateCondition = condition.getOrElse(Literal.TrueLiteral) val (metadataPredicates, dataPredicates) = DeltaTableUtils.splitMetadataAndDataPredicates( - updateCondition, - txn.metadata.partitionColumns, - sparkSession) + updateCondition, txn.metadata.partitionColumns, sparkSession) val candidateFiles = txn.filterFiles(metadataPredicates ++ dataPredicates) val nameToAddFile = generateCandidateFileMap(deltaLog.dataPath, candidateFiles) @@ -134,34 +140,27 @@ case class UpdateCommand( } else { // Case 3: Find all the affected files using the user-specified condition val fileIndex = new TahoeBatchFileIndex( - sparkSession, - "update", - candidateFiles, - deltaLog, - tahoeFileIndex.path, - txn.snapshot) + sparkSession, "update", candidateFiles, deltaLog, tahoeFileIndex.path, txn.snapshot) // Keep everything from the resolved target except a new TahoeFileIndex // that only involves the affected files instead of all files. 
val newTarget = DeltaTableUtils.replaceFileIndex(target, fileIndex) val data = Dataset.ofRows(sparkSession, newTarget) val updatedRowCount = metrics("numUpdatedRows") - val updatedRowUdf = DeltaUDF - .boolean { - () => - updatedRowCount += 1 - true - } - .asNondeterministic() + val updatedRowUdf = DeltaUDF.boolean { () => + updatedRowCount += 1 + true + }.asNondeterministic() val pathsToRewrite = withStatusCode("DELTA", UpdateCommand.FINDING_TOUCHED_FILES_MSG) { - data - .filter(new Column(updateCondition)) + // --- modified start + data.filter(new Column(updateCondition)) .select(input_file_name().as("input_files")) .filter(updatedRowUdf()) .select(explode(split(col("input_files"), ","))) .distinct() .as[String] .collect() + // --- modified end } scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 @@ -177,13 +176,8 @@ case class UpdateCommand( } else { // Generate the new files containing the updated values withStatusCode("DELTA", UpdateCommand.rewritingFilesMsg(filesToRewrite.size)) { - rewriteFiles( - sparkSession, - txn, - tahoeFileIndex.path, - filesToRewrite.map(_.path), - nameToAddFile, - updateCondition) + rewriteFiles(sparkSession, txn, tahoeFileIndex.path, + filesToRewrite.map(_.path), nameToAddFile, updateCondition) } } @@ -191,6 +185,7 @@ case class UpdateCommand( val (changeActions, addActions) = newActions.partition(_.isInstanceOf[AddCDCFile]) numRewrittenFiles = addActions.size + numAddedBytes = addActions.map(_.getFileSize).sum numAddedChangeFiles = changeActions.size changeFileBytes = changeActions.collect { case f: AddCDCFile => f.size }.sum @@ -202,47 +197,42 @@ case class UpdateCommand( // files containing the updated values val operationTimestamp = System.currentTimeMillis() val deleteActions = filesToRewrite.map(_.removeWithTimestamp(operationTimestamp)) - + numRemovedBytes = filesToRewrite.map(_.getFileSize).sum deleteActions ++ newActions } - if (totalActions.nonEmpty) { - metrics("numAddedFiles").set(numRewrittenFiles) - metrics("numAddedChangeFiles").set(numAddedChangeFiles) - metrics("changeFileBytes").set(changeFileBytes) - metrics("numRemovedFiles").set(numTouchedFiles) - metrics("executionTimeMs").set((System.nanoTime() - startTime) / 1000 / 1000) - metrics("scanTimeMs").set(scanTimeMs) - metrics("rewriteTimeMs").set(rewriteTimeMs) - // In the case where the numUpdatedRows is not captured, we can siphon out the metrics from - // the BasicWriteStatsTracker. This is for case 2 where the update condition contains only - // metadata predicates and so the entire partition is re-written. - val outputRows = txn.getMetric("numOutputRows").map(_.value).getOrElse(-1L) - if ( - metrics("numUpdatedRows").value == 0 && outputRows != 0 && - metrics("numCopiedRows").value == 0 - ) { - // We know that numTouchedRows = numCopiedRows + numUpdatedRows. - // Since an entire partition was re-written, no rows were copied. - // So numTouchedRows == numUpdateRows - metrics("numUpdatedRows").set(metrics("numTouchedRows").value) - } else { - // This is for case 3 where the update condition contains both metadata and data predicates - // so relevant files will have some rows updated and some rows copied. We don't need to - // consider case 1 here, where no files match the update condition, as we know that - // `totalActions` is empty. 
- metrics("numCopiedRows").set( - metrics("numTouchedRows").value - metrics("numUpdatedRows").value) - } - txn.registerSQLMetrics(sparkSession, metrics) - txn.commit(totalActions, DeltaOperations.Update(condition.map(_.toString))) - // This is needed to make the SQL metrics visible in the Spark UI - val executionId = sparkSession.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) - SQLMetrics.postDriverMetricUpdates( - sparkSession.sparkContext, - executionId, - metrics.values.toSeq) + metrics("numAddedFiles").set(numRewrittenFiles) + metrics("numAddedBytes").set(numAddedBytes) + metrics("numAddedChangeFiles").set(numAddedChangeFiles) + metrics("changeFileBytes").set(changeFileBytes) + metrics("numRemovedFiles").set(numTouchedFiles) + metrics("numRemovedBytes").set(numRemovedBytes) + metrics("executionTimeMs").set((System.nanoTime() - startTime) / 1000 / 1000) + metrics("scanTimeMs").set(scanTimeMs) + metrics("rewriteTimeMs").set(rewriteTimeMs) + // In the case where the numUpdatedRows is not captured, we can siphon out the metrics from + // the BasicWriteStatsTracker. This is for case 2 where the update condition contains only + // metadata predicates and so the entire partition is re-written. + val outputRows = txn.getMetric("numOutputRows").map(_.value).getOrElse(-1L) + if (metrics("numUpdatedRows").value == 0 && outputRows != 0 && + metrics("numCopiedRows").value == 0) { + // We know that numTouchedRows = numCopiedRows + numUpdatedRows. + // Since an entire partition was re-written, no rows were copied. + // So numTouchedRows == numUpdateRows + metrics("numUpdatedRows").set(metrics("numTouchedRows").value) + } else { + // This is for case 3 where the update condition contains both metadata and data predicates + // so relevant files will have some rows updated and some rows copied. We don't need to + // consider case 1 here, where no files match the update condition, as we know that + // `totalActions` is empty. + metrics("numCopiedRows").set( + metrics("numTouchedRows").value - metrics("numUpdatedRows").value) } + txn.registerSQLMetrics(sparkSession, metrics) + + val finalActions = createSetTransaction(sparkSession, deltaLog).toSeq ++ totalActions + txn.commitIfNeeded(finalActions, DeltaOperations.Update(condition.map(_.toString))) + sendDriverMetrics(sparkSession, metrics) recordDeltaEvent( deltaLog, @@ -255,19 +245,17 @@ case class UpdateCommand( numAddedChangeFiles, changeFileBytes, scanTimeMs, - rewriteTimeMs - ) + rewriteTimeMs) ) } /** * Scan all the affected files and write out the updated files. * - * When CDF is enabled, includes the generation of CDC preimage and postimage columns for changed - * rows. + * When CDF is enabled, includes the generation of CDC preimage and postimage columns for + * changed rows. * - * @return - * the list of [[AddFile]]s and [[AddCDCFile]]s that have been written. + * @return the list of [[AddFile]]s and [[AddCDCFile]]s that have been written. 
*/ private def rewriteFiles( spark: SparkSession, @@ -277,21 +265,18 @@ case class UpdateCommand( nameToAddFileMap: Map[String, AddFile], condition: Expression): Seq[FileAction] = { // Containing the map from the relative file path to AddFile - val baseRelation = - buildBaseRelation(spark, txn, "update", rootPath, inputLeafFiles, nameToAddFileMap) + val baseRelation = buildBaseRelation( + spark, txn, "update", rootPath, inputLeafFiles, nameToAddFileMap) val newTarget = DeltaTableUtils.replaceFileIndex(target, baseRelation.location) val targetDf = Dataset.ofRows(spark, newTarget) // Number of total rows that we have seen, i.e. are either copying or updating (sum of both). // This will be used later, along with numUpdatedRows, to determine numCopiedRows. val numTouchedRows = metrics("numTouchedRows") - val numTouchedRowsUdf = DeltaUDF - .boolean { - () => - numTouchedRows += 1 - true - } - .asNondeterministic() + val numTouchedRowsUdf = DeltaUDF.boolean { () => + numTouchedRows += 1 + true + }.asNondeterministic() val updatedDataFrame = UpdateCommand.withUpdatedColumns( target, @@ -300,8 +285,7 @@ case class UpdateCommand( targetDf .filter(numTouchedRowsUdf()) .withColumn(UpdateCommand.CONDITION_COLUMN_NAME, new Column(condition)), - UpdateCommand.shouldOutputCdc(txn) - ) + UpdateCommand.shouldOutputCdc(txn)) txn.writeFiles(updatedDataFrame) } @@ -324,25 +308,20 @@ object UpdateCommand { } /** - * Build the new columns. If the condition matches, generate the new value using the corresponding - * UPDATE EXPRESSION; otherwise, keep the original column value. + * Build the new columns. If the condition matches, generate the new value using + * the corresponding UPDATE EXPRESSION; otherwise, keep the original column value. * * When CDC is enabled, includes the generation of CDC pre-image and post-image columns for * changed rows. * - * @param target - * target we are updating into - * @param updateExpressions - * the update transformation to perform on the input DataFrame - * @param dfWithEvaluatedCondition - * source DataFrame on which we will apply the update expressions with an additional column - * CONDITION_COLUMN_NAME which is the true/false value of if the update condition is satisfied - * @param condition - * update condition - * @param shouldOutputCdc - * if we should output CDC data during this UPDATE operation. - * @return - * the updated DataFrame, with extra CDC columns if CDC is enabled + * @param target target we are updating into + * @param updateExpressions the update transformation to perform on the input DataFrame + * @param dfWithEvaluatedCondition source DataFrame on which we will apply the update expressions + * with an additional column CONDITION_COLUMN_NAME which is the + * true/false value of if the update condition is satisfied + * @param condition update condition + * @param shouldOutputCdc if we should output CDC data during this UPDATE operation. + * @return the updated DataFrame, with extra CDC columns if CDC is enabled */ def withUpdatedColumns( target: LogicalPlan, @@ -377,24 +356,22 @@ object UpdateCommand { If( UnresolvedAttribute(CONDITION_COLUMN_NAME), packedUpdates, // if it should be updated, then use `packagedUpdates` - array(struct(noopRewriteCols: _*)).expr - ) // else, this is a noop rewrite + array(struct(noopRewriteCols: _*)).expr) // else, this is a noop rewrite } // Explode the packed array, and project back out the final data columns. 
val finalColNames = target.output.map(_.name) :+ CDC_TYPE_COLUMN_NAME dfWithEvaluatedCondition .select(explode(new Column(packedData)).as("packedData")) - .select(finalColNames.map(n => col(s"packedData.`$n`").as(s"$n")): _*) + .select(finalColNames.map { n => col(s"packedData.`$n`").as(s"$n") }: _*) } else { - val finalCols = updateExpressions.zip(target.output).map { - case (update, original) => - val updated = if (condition == Literal.TrueLiteral) { - update - } else { - If(UnresolvedAttribute(CONDITION_COLUMN_NAME), update, original) - } - new Column(Alias(updated, original.name)()) + val finalCols = updateExpressions.zip(target.output).map { case (update, original) => + val updated = if (condition == Literal.TrueLiteral) { + update + } else { + If(UnresolvedAttribute(CONDITION_COLUMN_NAME), update, original) + } + new Column(Alias(updated, original.name)()) } dfWithEvaluatedCondition.select(finalCols: _*) @@ -407,25 +384,16 @@ object UpdateCommand { /** * Used to report details about update. * - * @param condition: - * what was the update condition - * @param numFilesTotal: - * how big is the table - * @param numTouchedFiles: - * how many files did we touch - * @param numRewrittenFiles: - * how many files had to be rewritten - * @param numAddedChangeFiles: - * how many change files were generated - * @param changeFileBytes: - * total size of change files generated - * @param scanTimeMs: - * how long did finding take - * @param rewriteTimeMs: - * how long did rewriting take + * @param condition: what was the update condition + * @param numFilesTotal: how big is the table + * @param numTouchedFiles: how many files did we touch + * @param numRewrittenFiles: how many files had to be rewritten + * @param numAddedChangeFiles: how many change files were generated + * @param changeFileBytes: total size of change files generated + * @param scanTimeMs: how long did finding take + * @param rewriteTimeMs: how long did rewriting take * - * @note - * All the time units are milliseconds. + * @note All the time units are milliseconds. */ case class UpdateMetric( condition: String, diff --git a/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/VacuumCommand.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/VacuumCommand.scala new file mode 100644 index 0000000000000..9f82feeee2fc0 --- /dev/null +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/commands/VacuumCommand.scala @@ -0,0 +1,594 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.delta.commands + +// scalastyle:off import.ordering.noEmptyLine +import java.net.URI +import java.util.Date +import java.util.concurrent.TimeUnit + +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.delta._ +import org.apache.spark.sql.delta.actions.{FileAction, RemoveFile} +import org.apache.spark.sql.delta.sources.DeltaSQLConf +import org.apache.spark.sql.delta.util.DeltaFileOperations +import org.apache.spark.sql.delta.util.DeltaFileOperations.tryDeleteNonRecursive +import com.fasterxml.jackson.databind.annotation.JsonDeserialize +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} + +import org.apache.gluten.utils.QueryPlanSelector +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.{Column, DataFrame, Dataset, SparkSession} +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric +import org.apache.spark.sql.functions._ +import org.apache.spark.util.{Clock, SerializableConfiguration, SystemClock} + +/** + * Gluten overwrite Delta: + * + * This file is copied from Delta 2.3.0. It is modified to overcome the following issues: + * 1. In Gluten, part is a directory, but VacuumCommand assumes part is a file. So we need some + * modifications to make it work. + */ + +/** + * Vacuums the table by clearing all untracked files and folders within this table. + * First lists all the files and directories in the table, and gets the relative paths with + * respect to the base of the table. Then it gets the list of all tracked files for this table, + * which may or may not be within the table base path, and gets the relative paths of + * all the tracked files with respect to the base of the table. Files outside of the table path + * will be ignored. Then we take a diff of the files and delete directories that were already empty, + * and all files that are within the table that are no longer tracked. + */ +object VacuumCommand extends VacuumCommandImpl with Serializable { + + // --- modified start + case class FileNameAndSize(path: String, length: Long, isDir: Boolean = false) + // --- modified end + /** + * Additional check on retention duration to prevent people from shooting themselves in the foot. + */ + protected def checkRetentionPeriodSafety( + spark: SparkSession, + retentionMs: Option[Long], + configuredRetention: Long): Unit = { + require(retentionMs.forall(_ >= 0), "Retention for Vacuum can't be less than 0.") + val checkEnabled = + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_RETENTION_CHECK_ENABLED) + val retentionSafe = retentionMs.forall(_ >= configuredRetention) + var configuredRetentionHours = TimeUnit.MILLISECONDS.toHours(configuredRetention) + if (TimeUnit.HOURS.toMillis(configuredRetentionHours) < configuredRetention) { + configuredRetentionHours += 1 + } + require(!checkEnabled || retentionSafe, + s"""Are you sure you would like to vacuum files with such a low retention period? If you have + |writers that are currently writing to this table, there is a risk that you may corrupt the + |state of your Delta table. 
+ | + |If you are certain that there are no operations being performed on this table, such as + |insert/upsert/delete/optimize, then you may turn off this check by setting: + |spark.databricks.delta.retentionDurationCheck.enabled = false + | + |If you are not sure, please use a value not less than "$configuredRetentionHours hours". + """.stripMargin) + } + + /** + * Clears all untracked files and folders within this table. First lists all the files and + * directories in the table, and gets the relative paths with respect to the base of the + * table. Then it gets the list of all tracked files for this table, which may or may not + * be within the table base path, and gets the relative paths of all the tracked files with + * respect to the base of the table. Files outside of the table path will be ignored. + * Then we take a diff of the files and delete directories that were already empty, and all files + * that are within the table that are no longer tracked. + * + * @param dryRun If set to true, no files will be deleted. Instead, we will list all files and + * directories that will be cleared. + * @param retentionHours An optional parameter to override the default Delta tombstone retention + * period + * @return A Dataset containing the paths of the files/folders to delete in dryRun mode. Otherwise + * returns the base path of the table. + */ + def gc( + spark: SparkSession, + deltaLog: DeltaLog, + dryRun: Boolean = true, + retentionHours: Option[Double] = None, + clock: Clock = new SystemClock): DataFrame = { + recordDeltaOperation(deltaLog, "delta.gc") { + + val path = deltaLog.dataPath + val deltaHadoopConf = deltaLog.newDeltaHadoopConf() + val fs = path.getFileSystem(deltaHadoopConf) + + import org.apache.spark.sql.delta.implicits._ + + val snapshot = deltaLog.update() + + require(snapshot.version >= 0, "No state defined for this table. Is this really " + + "a Delta table? 
Refusing to garbage collect.") + + // --- modified start + val isMergeTreeFormat = ClickHouseConfig + .isMergeTreeFormatEngine(deltaLog.unsafeVolatileMetadata.configuration) + // --- modified end + + DeletionVectorUtils.assertDeletionVectorsNotReadable( + spark, snapshot.metadata, snapshot.protocol) + + val snapshotTombstoneRetentionMillis = DeltaLog.tombstoneRetentionMillis(snapshot.metadata) + val retentionMillis = retentionHours.map(h => TimeUnit.HOURS.toMillis(math.round(h))) + checkRetentionPeriodSafety(spark, retentionMillis, snapshotTombstoneRetentionMillis) + + val deleteBeforeTimestamp = retentionMillis.map { millis => + clock.getTimeMillis() - millis + }.getOrElse(snapshot.minFileRetentionTimestamp) + // --- modified start: toGMTString is a deprecated function + logInfo(s"Starting garbage collection (dryRun = $dryRun) of untracked files older than " + + s"${new Date(deleteBeforeTimestamp).toString} in $path") + // --- modified end + val hadoopConf = spark.sparkContext.broadcast( + new SerializableConfiguration(deltaHadoopConf)) + val basePath = fs.makeQualified(path).toString + var isBloomFiltered = false + val parallelDeleteEnabled = + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_PARALLEL_DELETE_ENABLED) + val parallelDeletePartitions = + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_PARALLEL_DELETE_PARALLELISM) + .getOrElse(spark.sessionState.conf.numShufflePartitions) + val relativizeIgnoreError = + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_RELATIVIZE_IGNORE_ERROR) + val startTimeToIdentifyEligibleFiles = System.currentTimeMillis() + + // --- modified start + val originalEnabledGluten = + spark.sparkContext.getLocalProperty(QueryPlanSelector.GLUTEN_ENABLE_FOR_THREAD_KEY) + // gluten can not support vacuum command + spark.sparkContext.setLocalProperty(QueryPlanSelector.GLUTEN_ENABLE_FOR_THREAD_KEY, "false") + // --- modified end + + val validFiles = snapshot.stateDS + .mapPartitions { actions => + val reservoirBase = new Path(basePath) + val fs = reservoirBase.getFileSystem(hadoopConf.value.value) + actions.flatMap { + _.unwrap match { + case tombstone: RemoveFile if tombstone.delTimestamp < deleteBeforeTimestamp => + Nil + case fa: FileAction => + getValidRelativePathsAndSubdirs( + fa, + fs, + reservoirBase, + relativizeIgnoreError, + isBloomFiltered) + case _ => Nil + } + } + }.toDF("path") + + val partitionColumns = snapshot.metadata.partitionSchema.fieldNames + val parallelism = spark.sessionState.conf.parallelPartitionDiscoveryParallelism + + val allFilesAndDirs = DeltaFileOperations.recursiveListDirs( + spark, + Seq(basePath), + hadoopConf, + hiddenDirNameFilter = DeltaTableUtils.isHiddenDirectory(partitionColumns, _), + hiddenFileNameFilter = DeltaTableUtils.isHiddenDirectory(partitionColumns, _), + fileListingParallelism = Option(parallelism) + ) + .groupByKey(_.path) + .mapGroups { (k, v) => + val duplicates = v.toSeq + // of all the duplicates we can return the newest file. + duplicates.maxBy(_.modificationTime) + } + + try { + allFilesAndDirs.cache() + + implicit val fileNameAndSizeEncoder = org.apache.spark.sql.Encoders.product[FileNameAndSize] + + val dirCounts = allFilesAndDirs.where(col("isDir")).count() + 1 // +1 for the base path + + // The logic below is as follows: + // 1. We take all the files and directories listed in our reservoir + // 2. We filter all files older than our tombstone retention period and directories + // 3. We get the subdirectories of all files so that we can find non-empty directories + // 4. 
We groupBy each path, and count to get how many files are in each sub-directory + // 5. We subtract all the valid files and tombstones in our state + // 6. We filter all paths with a count of 1, which will correspond to files not in the + // state, and empty directories. We can safely delete all of these + // --- modified start + val diff = if (isMergeTreeFormat) { + val diff_tmp = allFilesAndDirs + .where(col("modificationTime") < deleteBeforeTimestamp || col("isDir")) + .mapPartitions { fileStatusIterator => + val reservoirBase = new Path(basePath) + val fs = reservoirBase.getFileSystem(hadoopConf.value.value) + fileStatusIterator.flatMap { fileStatus => + if (fileStatus.isDir) { + Iterator.single(FileNameAndSize( + relativize(fileStatus.getHadoopPath, fs, reservoirBase, isDir = true), + 0L, + true)) + } else { + val dirs = getAllSubdirs(basePath, fileStatus.path, fs) + val dirsWithSlash = dirs.map { p => + val relativizedPath = relativize(new Path(p), fs, reservoirBase, isDir = true) + FileNameAndSize(relativizedPath, 0L, true) + } + dirsWithSlash ++ Iterator( + FileNameAndSize(relativize( + fileStatus.getHadoopPath, fs, reservoirBase, isDir = false), + fileStatus.length)) + } + } + } + .withColumn( + "dir", + when(col("isDir"), col("path")) + .otherwise(expr("substring_index(path, '/',size(split(path, '/')) -1)"))) + .groupBy(col("path"), col("dir")) + .agg(count(new Column("*")).as("count"), sum("length").as("length")) + + diff_tmp + .join(validFiles, diff_tmp("dir") === validFiles("path"), "leftanti") + .where(col("count") === 1) + } else { + allFilesAndDirs + .where(col("modificationTime") < deleteBeforeTimestamp || col("isDir")) + .mapPartitions { fileStatusIterator => + val reservoirBase = new Path(basePath) + val fs = reservoirBase.getFileSystem(hadoopConf.value.value) + fileStatusIterator.flatMap { fileStatus => + if (fileStatus.isDir) { + Iterator.single(FileNameAndSize( + relativize(fileStatus.getHadoopPath, fs, reservoirBase, isDir = true), 0L)) + } else { + val dirs = getAllSubdirs(basePath, fileStatus.path, fs) + val dirsWithSlash = dirs.map { p => + val relativizedPath = relativize(new Path(p), fs, reservoirBase, isDir = true) + FileNameAndSize(relativizedPath, 0L) + } + dirsWithSlash ++ Iterator( + FileNameAndSize(relativize( + fileStatus.getHadoopPath, fs, reservoirBase, isDir = false), + fileStatus.length)) + } + } + } + .groupBy(col("path")) + .agg(count(new Column("*")).as("count"), sum("length").as("length")) + .join(validFiles, Seq("path"), "leftanti") + .where(col("count") === 1) + } + // --- modified end + + val sizeOfDataToDeleteRow = diff.agg(sum("length").cast("long")).first + val sizeOfDataToDelete = if (sizeOfDataToDeleteRow.isNullAt(0)) { + 0L + } else { + sizeOfDataToDeleteRow.getLong(0) + } + + val diffFiles = diff + .select(col("path")) + .as[String] + .map { relativePath => + assert(!stringToPath(relativePath).isAbsolute, + "Shouldn't have any absolute paths for deletion here.") + pathToString(DeltaFileOperations.absolutePath(basePath, relativePath)) + } + val timeTakenToIdentifyEligibleFiles = + System.currentTimeMillis() - startTimeToIdentifyEligibleFiles + + val numFiles = diffFiles.count() + if (dryRun) { + val stats = DeltaVacuumStats( + isDryRun = true, + specifiedRetentionMillis = retentionMillis, + defaultRetentionMillis = snapshotTombstoneRetentionMillis, + minRetainedTimestamp = deleteBeforeTimestamp, + dirsPresentBeforeDelete = dirCounts, + objectsDeleted = numFiles, + sizeOfDataToDelete = sizeOfDataToDelete, + timeTakenToIdentifyEligibleFiles 
= timeTakenToIdentifyEligibleFiles, + timeTakenForDelete = 0L) + + recordDeltaEvent(deltaLog, "delta.gc.stats", data = stats) + logConsole(s"Found $numFiles files ($sizeOfDataToDelete bytes) and directories in " + + s"a total of $dirCounts directories that are safe to delete.") + + return diffFiles.map(f => stringToPath(f).toString).toDF("path") + } + logVacuumStart( + spark, + deltaLog, + path, + diffFiles, + sizeOfDataToDelete, + retentionMillis, + snapshotTombstoneRetentionMillis) + + val deleteStartTime = System.currentTimeMillis() + val filesDeleted = try { + delete(diffFiles, spark, basePath, + hadoopConf, parallelDeleteEnabled, parallelDeletePartitions) + } catch { + case t: Throwable => + logVacuumEnd(deltaLog, spark, path) + throw t + } + val timeTakenForDelete = System.currentTimeMillis() - deleteStartTime + val stats = DeltaVacuumStats( + isDryRun = false, + specifiedRetentionMillis = retentionMillis, + defaultRetentionMillis = snapshotTombstoneRetentionMillis, + minRetainedTimestamp = deleteBeforeTimestamp, + dirsPresentBeforeDelete = dirCounts, + objectsDeleted = filesDeleted, + sizeOfDataToDelete = sizeOfDataToDelete, + timeTakenToIdentifyEligibleFiles = timeTakenToIdentifyEligibleFiles, + timeTakenForDelete = timeTakenForDelete) + recordDeltaEvent(deltaLog, "delta.gc.stats", data = stats) + logVacuumEnd(deltaLog, spark, path, Some(filesDeleted), Some(dirCounts)) + + + spark.createDataset(Seq(basePath)).toDF("path") + } finally { + allFilesAndDirs.unpersist() + + // --- modified start + if (originalEnabledGluten != null) { + spark.sparkContext.setLocalProperty( + QueryPlanSelector.GLUTEN_ENABLE_FOR_THREAD_KEY, originalEnabledGluten) + } else { + spark.sparkContext.setLocalProperty( + QueryPlanSelector.GLUTEN_ENABLE_FOR_THREAD_KEY, "true") + } + // --- modified end + } + } + } +} + +trait VacuumCommandImpl extends DeltaCommand { + + private val supportedFsForLogging = Seq( + "wasbs", "wasbss", "abfs", "abfss", "adl", "gs", "file", "hdfs" + ) + + /** + * Returns whether we should record vacuum metrics in the delta log. + */ + private def shouldLogVacuum( + spark: SparkSession, + deltaLog: DeltaLog, + hadoopConf: Configuration, + path: Path): Boolean = { + val logVacuumConf = spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_LOGGING_ENABLED) + + if (logVacuumConf.nonEmpty) { + return logVacuumConf.get + } + + val logStore = deltaLog.store + + try { + val rawResolvedUri: URI = logStore.resolvePathOnPhysicalStorage(path, hadoopConf).toUri + val scheme = rawResolvedUri.getScheme + supportedFsForLogging.contains(scheme) + } catch { + case _: UnsupportedOperationException => + logWarning("Vacuum event logging" + + " not enabled on this file system because we cannot detect your cloud storage type.") + false + } + } + + /** + * Record Vacuum specific metrics in the commit log at the START of vacuum. 
+ * + * @param spark - spark session + * @param deltaLog - DeltaLog of the table + * @param path - the (data) path to the root of the table + * @param diff - the list of paths (files, directories) that are safe to delete + * @param sizeOfDataToDelete - the amount of data (bytes) to be deleted + * @param specifiedRetentionMillis - the optional override retention period (millis) to keep + * logically removed files before deleting them + * @param defaultRetentionMillis - the default retention period (millis) + */ + protected def logVacuumStart( + spark: SparkSession, + deltaLog: DeltaLog, + path: Path, + diff: Dataset[String], + sizeOfDataToDelete: Long, + specifiedRetentionMillis: Option[Long], + defaultRetentionMillis: Long): Unit = { + logInfo(s"Deleting untracked files and empty directories in $path. The amount of data to be " + + s"deleted is $sizeOfDataToDelete (in bytes)") + + // We perform an empty commit in order to record information about the Vacuum + if (shouldLogVacuum(spark, deltaLog, deltaLog.newDeltaHadoopConf(), path)) { + val checkEnabled = + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_RETENTION_CHECK_ENABLED) + val txn = deltaLog.startTransaction() + val metrics = Map[String, SQLMetric]( + "numFilesToDelete" -> createMetric(spark.sparkContext, "number of files to deleted"), + "sizeOfDataToDelete" -> createMetric(spark.sparkContext, + "The total amount of data to be deleted in bytes") + ) + metrics("numFilesToDelete").set(diff.count()) + metrics("sizeOfDataToDelete").set(sizeOfDataToDelete) + txn.registerSQLMetrics(spark, metrics) + txn.commit(actions = Seq(), DeltaOperations.VacuumStart( + checkEnabled, + specifiedRetentionMillis, + defaultRetentionMillis + )) + } + } + + /** + * Record Vacuum specific metrics in the commit log at the END of vacuum. + * + * @param deltaLog - DeltaLog of the table + * @param spark - spark session + * @param path - the (data) path to the root of the table + * @param filesDeleted - if the vacuum completed this will contain the number of files deleted. + * if the vacuum failed, this will be None. + * @param dirCounts - if the vacuum completed this will contain the number of directories + * vacuumed. if the vacuum failed, this will be None. + */ + protected def logVacuumEnd( + deltaLog: DeltaLog, + spark: SparkSession, + path: Path, + filesDeleted: Option[Long] = None, + dirCounts: Option[Long] = None): Unit = { + if (shouldLogVacuum(spark, deltaLog, deltaLog.newDeltaHadoopConf(), path)) { + val txn = deltaLog.startTransaction() + val status = if (filesDeleted.isEmpty && dirCounts.isEmpty) { "FAILED" } else { "COMPLETED" } + if (filesDeleted.nonEmpty && dirCounts.nonEmpty) { + val metrics = Map[String, SQLMetric]( + "numDeletedFiles" -> createMetric(spark.sparkContext, "number of files deleted."), + "numVacuumedDirectories" -> + createMetric(spark.sparkContext, "num of directories vacuumed."), + "status" -> createMetric(spark.sparkContext, "status of vacuum") + ) + metrics("numDeletedFiles").set(filesDeleted.get) + metrics("numVacuumedDirectories").set(dirCounts.get) + txn.registerSQLMetrics(spark, metrics) + } + txn.commit(actions = Seq(), DeltaOperations.VacuumEnd( + status + )) + } + + if (filesDeleted.nonEmpty) { + logConsole(s"Deleted ${filesDeleted.get} files and directories in a total " + + s"of ${dirCounts.get} directories.") + } + } + + /** + * Attempts to relativize the `path` with respect to the `reservoirBase` and converts the path to + * a string. 
+ */ + protected def relativize( + path: Path, + fs: FileSystem, + reservoirBase: Path, + isDir: Boolean): String = { + pathToString(DeltaFileOperations.tryRelativizePath(fs, reservoirBase, path)) + } + + /** + * Wrapper function for DeltaFileOperations.getAllSubDirectories + * returns all subdirectories that `file` has with respect to `base`. + */ + protected def getAllSubdirs(base: String, file: String, fs: FileSystem): Iterator[String] = { + DeltaFileOperations.getAllSubDirectories(base, file)._1 + } + + /** + * Attempts to delete the list of candidate files. Returns the number of files deleted. + */ + protected def delete( + diff: Dataset[String], + spark: SparkSession, + basePath: String, + hadoopConf: Broadcast[SerializableConfiguration], + parallel: Boolean, + parallelPartitions: Int): Long = { + import org.apache.spark.sql.delta.implicits._ + + if (parallel) { + diff.repartition(parallelPartitions).mapPartitions { files => + val fs = new Path(basePath).getFileSystem(hadoopConf.value.value) + val filesDeletedPerPartition = + files.map(p => stringToPath(p)).count(f => tryDeleteNonRecursive(fs, f)) + Iterator(filesDeletedPerPartition) + }.collect().sum + } else { + val fs = new Path(basePath).getFileSystem(hadoopConf.value.value) + val fileResultSet = diff.toLocalIterator().asScala + fileResultSet.map(p => stringToPath(p)).count(f => tryDeleteNonRecursive(fs, f)) + } + } + + protected def stringToPath(path: String): Path = new Path(new URI(path)) + + protected def pathToString(path: Path): String = path.toUri.toString + + /** Returns the relative path of a file action or None if the file lives outside of the table. */ + protected def getActionRelativePath( + action: FileAction, + fs: FileSystem, + basePath: Path, + relativizeIgnoreError: Boolean): Option[String] = { + val filePath = stringToPath(action.path) + if (filePath.isAbsolute) { + val maybeRelative = + DeltaFileOperations.tryRelativizePath(fs, basePath, filePath, relativizeIgnoreError) + if (maybeRelative.isAbsolute) { + // This file lives outside the directory of the table. + None + } else { + Some(pathToString(maybeRelative)) + } + } else { + Some(pathToString(filePath)) + } + } + + + /** + * Returns the relative paths of all files and subdirectories for this action that must be + * retained during GC. 
+ */
+  protected def getValidRelativePathsAndSubdirs(
+      action: FileAction,
+      fs: FileSystem,
+      basePath: Path,
+      relativizeIgnoreError: Boolean,
+      isBloomFiltered: Boolean): Seq[String] = {
+    getActionRelativePath(action, fs, basePath, relativizeIgnoreError).map { relativePath =>
+      Seq(relativePath) ++ getAllSubdirs("/", relativePath, fs)
+    }.getOrElse(Seq.empty)
+  }
+}
+
+case class DeltaVacuumStats(
+    isDryRun: Boolean,
+    @JsonDeserialize(contentAs = classOf[java.lang.Long])
+    specifiedRetentionMillis: Option[Long],
+    defaultRetentionMillis: Long,
+    minRetainedTimestamp: Long,
+    dirsPresentBeforeDelete: Long,
+    objectsDeleted: Long,
+    sizeOfDataToDelete: Long,
+    timeTakenToIdentifyEligibleFiles: Long,
+    timeTakenForDelete: Long)
diff --git a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/files/MergeTreeCommitProtocol.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/files/MergeTreeCommitProtocol.scala
similarity index 68%
rename from backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/files/MergeTreeCommitProtocol.scala
rename to backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/files/MergeTreeCommitProtocol.scala
index 52fb921ad4868..0a10d073e5c49 100644
--- a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/delta/files/MergeTreeCommitProtocol.scala
+++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/delta/files/MergeTreeCommitProtocol.scala
@@ -14,8 +14,12 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+
 package org.apache.spark.sql.delta.files
+// scalastyle:off import.ordering.noEmptyLine
+import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.io.FileCommitProtocol
 import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage
@@ -26,18 +30,28 @@ import org.apache.spark.sql.delta.commands.cdc.CDCReader.{CDC_LOCATION, CDC_PART
 import org.apache.spark.sql.delta.util.{DateFormatter, PartitionUtils, TimestampFormatter, Utils => DeltaUtils}
 import org.apache.spark.sql.types.StringType
-// scalastyle:off import.ordering.noEmptyLine
-import org.apache.hadoop.fs.{FileStatus, Path}
-import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
-
 import java.util.UUID
-
 import scala.collection.mutable.ArrayBuffer
-class MergeTreeCommitProtocol(jobId: String, path: String, randomPrefixLength: Option[Int])
-  extends FileCommitProtocol
-  with Serializable
-  with Logging {
+/**
+ * This file is copied from the DelayedCommitProtocol of Delta 2.3.0
+ * and renamed to MergeTreeCommitProtocol.
+ * It is modified to overcome the following issues:
+ * 1. The function commitTask will return TaskCommitMessage(Nil);
+ *    the FileStatus list will be retrieved from the CH backend.
+ */
+
+/**
+ * Writes out the files to `path` and returns a list of them in `addedStatuses`. Includes
+ * special handling for partitioning on [[CDC_PARTITION_COL]] for
+ * compatibility between enabled and disabled CDC; partitions with a value of false in this
+ * column produce no corresponding partitioning directory.
+ */
+class MergeTreeCommitProtocol(
+    jobId: String,
+    path: String,
+    randomPrefixLength: Option[Int])
+  extends FileCommitProtocol with Serializable with Logging {
 // Track the list of files added by a task, only used on the executors.
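+  // Each entry pairs the parsed partition values with the relative file path recorded by newTaskTempFile.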
@transient protected var addedFiles: ArrayBuffer[(Map[String, String], String)] = _ @@ -58,11 +72,13 @@ class MergeTreeCommitProtocol(jobId: String, path: String, randomPrefixLength: O // Constants for CDC partition manipulation. Used only in newTaskTempFile(), but we define them // here to avoid building a new redundant regex for every file. - protected val cdcPartitionFalse = s"$CDC_PARTITION_COL=false" - protected val cdcPartitionTrue = s"$CDC_PARTITION_COL=true" + protected val cdcPartitionFalse = s"${CDC_PARTITION_COL}=false" + protected val cdcPartitionTrue = s"${CDC_PARTITION_COL}=true" protected val cdcPartitionTrueRegex = cdcPartitionTrue.r - override def setupJob(jobContext: JobContext): Unit = {} + override def setupJob(jobContext: JobContext): Unit = { + + } /** * Commits a job after the writes succeed. Must be called on the driver. Partitions the written @@ -70,8 +86,7 @@ class MergeTreeCommitProtocol(jobId: String, path: String, randomPrefixLength: O * by [[TransactionalWrite]] (i.e. AddFile's may have additional statistics injected) */ override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { - val (addFiles, changeFiles) = taskCommits - .flatMap(_.obj.asInstanceOf[Seq[_]]) + val (addFiles, changeFiles) = taskCommits.flatMap(_.obj.asInstanceOf[Seq[_]]) .partition { case _: AddFile => true case _: AddCDCFile => false @@ -128,12 +143,14 @@ class MergeTreeCommitProtocol(jobId: String, path: String, randomPrefixLength: O timestampFormatter) ._1 .get - parsedPartition.columnNames - .zip( - parsedPartition.literals - .map(l => Cast(l, StringType).eval()) - .map(Option(_).map(_.toString).orNull)) - .toMap + parsedPartition + .columnNames + .zip( + parsedPartition + .literals + .map(l => Cast(l, StringType).eval()) + .map(Option(_).map(_.toString).orNull)) + .toMap } /** @@ -142,56 +159,46 @@ class MergeTreeCommitProtocol(jobId: String, path: String, randomPrefixLength: O * * Includes special logic for CDC files and paths. Specifically, if the directory `dir` contains * the CDC partition `__is_cdc=true` then - * - the file name begins with `cdc-` instead of `part-` - * - the directory has the `__is_cdc=true` partition removed and is placed in the - * `_changed_data` folder + * - the file name begins with `cdc-` instead of `part-` + * - the directory has the `__is_cdc=true` partition removed and is placed in the `_changed_data` + * folder */ override def newTaskTempFile( - taskContext: TaskAttemptContext, - dir: Option[String], - ext: String): String = { + taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { val partitionValues = dir.map(parsePartitions).getOrElse(Map.empty[String, String]) val filename = getFileName(taskContext, ext, partitionValues) - val relativePath = randomPrefixLength - .map { - prefixLength => - DeltaUtils.getRandomPrefix(prefixLength) // Generate a random prefix as a first choice - } - .orElse { - dir // or else write into the partition directory if it is partitioned + val relativePath = randomPrefixLength.map { prefixLength => + DeltaUtils.getRandomPrefix(prefixLength) // Generate a random prefix as a first choice + }.orElse { + dir // or else write into the partition directory if it is partitioned + }.map { subDir => + // Do some surgery on the paths we write out to eliminate the CDC_PARTITION_COL. Non-CDC + // data is written to the base location, while CDC data is written to a special folder + // _change_data. 
+ // The code here gets a bit complicated to accommodate two corner cases: an empty subdir + // can't be passed to new Path() at all, and a single-level subdir won't have a trailing + // slash. + if (subDir == cdcPartitionFalse) { + new Path(filename) + } else if (subDir.startsWith(cdcPartitionTrue)) { + val cleanedSubDir = cdcPartitionTrueRegex.replaceFirstIn(subDir, CDC_LOCATION) + new Path(cleanedSubDir, filename) + } else if (subDir.startsWith(cdcPartitionFalse)) { + // We need to remove the trailing slash in addition to the directory - otherwise + // it'll be interpreted as an absolute path and fail. + val cleanedSubDir = subDir.stripPrefix(cdcPartitionFalse + "/") + new Path(cleanedSubDir, filename) + } else { + new Path(subDir, filename) } - .map { - subDir => - // Do some surgery on the paths we write out to eliminate the CDC_PARTITION_COL. Non-CDC - // data is written to the base location, while CDC data is written to a special folder - // _change_data. - // The code here gets a bit complicated to accommodate two corner cases: an empty subdir - // can't be passed to new Path() at all, and a single-level subdir won't have a trailing - // slash. - if (subDir == cdcPartitionFalse) { - new Path(filename) - } else if (subDir.startsWith(cdcPartitionTrue)) { - val cleanedSubDir = cdcPartitionTrueRegex.replaceFirstIn(subDir, CDC_LOCATION) - new Path(cleanedSubDir, filename) - } else if (subDir.startsWith(cdcPartitionFalse)) { - // We need to remove the trailing slash in addition to the directory - otherwise - // it'll be interpreted as an absolute path and fail. - val cleanedSubDir = subDir.stripPrefix(cdcPartitionFalse + "/") - new Path(cleanedSubDir, filename) - } else { - new Path(subDir, filename) - } - } - .getOrElse(new Path(filename)) // or directly write out to the output path + }.getOrElse(new Path(filename)) // or directly write out to the output path addedFiles.append((partitionValues, relativePath.toUri.toString)) new Path(path, relativePath).toString } override def newTaskTempFileAbsPath( - taskContext: TaskAttemptContext, - absoluteDir: String, - ext: String): String = { + taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = { throw DeltaErrors.unsupportedAbsPathAddFile(s"$this") } @@ -213,8 +220,9 @@ class MergeTreeCommitProtocol(jobId: String, path: String, randomPrefixLength: O } override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { - if (addedFiles.nonEmpty) { - /* val fs = new Path(path, addedFiles.head._2).getFileSystem(taskContext.getConfiguration) + // --- modified start + /* if (addedFiles.nonEmpty) { + val fs = new Path(path, addedFiles.head._2).getFileSystem(taskContext.getConfiguration) val statuses: Seq[FileAction] = addedFiles.map { f => val filePath = new Path(path, new Path(new URI(f._2))) val stat = fs.getFileStatus(filePath) @@ -222,15 +230,15 @@ class MergeTreeCommitProtocol(jobId: String, path: String, randomPrefixLength: O buildActionFromAddedFile(f, stat, taskContext) }.toSeq - new TaskCommitMessage(statuses) */ - new TaskCommitMessage(Nil) + new TaskCommitMessage(statuses) } else { new TaskCommitMessage(Nil) - } + } */ + // --- modified end + new TaskCommitMessage(Nil) } override def abortTask(taskContext: TaskAttemptContext): Unit = { // TODO: we can also try delete the addedFiles as a best-effort cleanup. 
} - } diff --git a/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseDataSource.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseDataSource.scala new file mode 100644 index 0000000000000..8c1062f4c7b6c --- /dev/null +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseDataSource.scala @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.datasources.v2.clickhouse + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.connector.catalog.Table +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.delta._ +import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 +import org.apache.spark.sql.delta.commands.WriteIntoDelta +import org.apache.spark.sql.delta.commands.cdc.CDCReader +import org.apache.spark.sql.delta.sources.{DeltaDataSource, DeltaSourceUtils, DeltaSQLConf} +import org.apache.spark.sql.sources.BaseRelation +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +import org.apache.hadoop.fs.Path + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +/** A DataSource V1 for integrating Delta into Spark SQL batch and Streaming APIs. 
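+ *
+ * A minimal usage sketch, for illustration only (the path below is hypothetical); the source is
+ * addressed through its short name:
+ * {{{
+ *   // "/data/clickhouse/events" is a hypothetical location
+ *   df.write.format("clickhouse").mode("append").save("/data/clickhouse/events")
+ *   val reloaded = spark.read.format("clickhouse").load("/data/clickhouse/events")
+ * }}}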
*/ +class ClickHouseDataSource extends DeltaDataSource { + + override def shortName(): String = { + ClickHouseConfig.NAME + } + + override def getTable( + schema: StructType, + partitioning: Array[Transform], + properties: java.util.Map[String, String]): Table = { + val options = new CaseInsensitiveStringMap(properties) + val path = options.get("path") + if (path == null) throw DeltaErrors.pathNotSpecifiedException + new ClickHouseTableV2( + SparkSession.active, + new Path(path), + options = properties.asScala.toMap, + clickhouseExtensionOptions = ClickHouseConfig + .createMergeTreeConfigurations( + ClickHouseConfig + .getMergeTreeConfigurations(properties) + .asJava) + ) + } + + override def createRelation( + sqlContext: SQLContext, + mode: SaveMode, + parameters: Map[String, String], + data: DataFrame): BaseRelation = { + val path = parameters.getOrElse("path", throw DeltaErrors.pathNotSpecifiedException) + val partitionColumns = parameters + .get(DeltaSourceUtils.PARTITIONING_COLUMNS_KEY) + .map(DeltaDataSource.decodePartitioningColumns) + .getOrElse(Nil) + + val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, path, parameters) + // need to use the latest snapshot + val configs = if (deltaLog.update().version < 0) { + // when creating table, save the clickhouse config to the delta metadata + val clickHouseTableV2 = ClickHouseTableV2.getTable(deltaLog) + clickHouseTableV2.properties().asScala.toMap ++ DeltaConfigs + .validateConfigurations(parameters.filterKeys(_.startsWith("delta.")).toMap) + } else { + DeltaConfigs.validateConfigurations(parameters.filterKeys(_.startsWith("delta.")).toMap) + } + WriteIntoDelta( + deltaLog = deltaLog, + mode = mode, + new DeltaOptions(parameters, sqlContext.sparkSession.sessionState.conf), + partitionColumns = partitionColumns, + configuration = configs, + data = data + ).run(sqlContext.sparkSession) + + deltaLog.createRelation() + } + + override def createRelation( + sqlContext: SQLContext, + parameters: Map[String, String]): BaseRelation = { + recordFrameProfile("Delta", "DeltaDataSource.createRelation") { + val maybePath = parameters.getOrElse("path", throw DeltaErrors.pathNotSpecifiedException) + + // Log any invalid options that are being passed in + DeltaOptions.verifyOptions(CaseInsensitiveMap(parameters)) + + val timeTravelByParams = DeltaDataSource.getTimeTravelVersion(parameters) + var cdcOptions: mutable.Map[String, String] = mutable.Map.empty + val caseInsensitiveParams = new CaseInsensitiveStringMap(parameters.asJava) + if (CDCReader.isCDCRead(caseInsensitiveParams)) { + cdcOptions = mutable.Map[String, String](DeltaDataSource.CDC_ENABLED_KEY -> "true") + if (caseInsensitiveParams.containsKey(DeltaDataSource.CDC_START_VERSION_KEY)) { + cdcOptions(DeltaDataSource.CDC_START_VERSION_KEY) = + caseInsensitiveParams.get(DeltaDataSource.CDC_START_VERSION_KEY) + } + if (caseInsensitiveParams.containsKey(DeltaDataSource.CDC_START_TIMESTAMP_KEY)) { + cdcOptions(DeltaDataSource.CDC_START_TIMESTAMP_KEY) = + caseInsensitiveParams.get(DeltaDataSource.CDC_START_TIMESTAMP_KEY) + } + if (caseInsensitiveParams.containsKey(DeltaDataSource.CDC_END_VERSION_KEY)) { + cdcOptions(DeltaDataSource.CDC_END_VERSION_KEY) = + caseInsensitiveParams.get(DeltaDataSource.CDC_END_VERSION_KEY) + } + if (caseInsensitiveParams.containsKey(DeltaDataSource.CDC_END_TIMESTAMP_KEY)) { + cdcOptions(DeltaDataSource.CDC_END_TIMESTAMP_KEY) = + caseInsensitiveParams.get(DeltaDataSource.CDC_END_TIMESTAMP_KEY) + } + } + val dfOptions: Map[String, String] = + if ( + 
sqlContext.sparkSession.sessionState.conf.getConf( + DeltaSQLConf.LOAD_FILE_SYSTEM_CONFIGS_FROM_DATAFRAME_OPTIONS) + ) { + parameters + } else { + Map.empty + } + (new ClickHouseTableV2( + sqlContext.sparkSession, + new Path(maybePath), + timeTravelOpt = timeTravelByParams, + options = dfOptions, + cdcOptions = new CaseInsensitiveStringMap(cdcOptions.asJava) + )).toBaseRelation + } + } +} diff --git a/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseSparkCatalog.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseSparkCatalog.scala new file mode 100644 index 0000000000000..61e1da44d0af0 --- /dev/null +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseSparkCatalog.scala @@ -0,0 +1,662 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.datasources.v2.clickhouse + +import org.apache.gluten.sql.shims.SparkShimLoader + +import org.apache.spark.sql.{AnalysisException, DataFrame, SparkSession} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchNamespaceException, NoSuchTableException} +import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.catalog._ +import org.apache.spark.sql.connector.catalog.TableCapability.V1_BATCH_WRITE +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, V1Write, WriteBuilder} +import org.apache.spark.sql.delta.{DeltaConfigs, DeltaErrors, DeltaLog, DeltaOptions, DeltaTableUtils} +import org.apache.spark.sql.delta.DeltaTableIdentifier.gluePermissionError +import org.apache.spark.sql.delta.catalog.{ClickHouseTableV2, DeltaTableV2, TempClickHouseTableV2} +import org.apache.spark.sql.delta.commands.{CreateDeltaTableCommand, TableCreationModes, WriteIntoDelta} +import org.apache.spark.sql.delta.metering.DeltaLogging +import org.apache.spark.sql.delta.sources.{DeltaSourceUtils, DeltaSQLConf} +import org.apache.spark.sql.execution.datasources.{DataSource, PartitioningUtils} +import org.apache.spark.sql.execution.datasources.v2.clickhouse.utils.CHDataSourceUtils +import org.apache.spark.sql.sources.InsertableRelation +import org.apache.spark.sql.types.StructType + +import org.apache.hadoop.fs.Path + +import java.util +import java.util.Locale + +import scala.collection.JavaConverters._ + +class ClickHouseSparkCatalog + extends DelegatingCatalogExtension + with StagingTableCatalog + with SupportsPathIdentifier + with DeltaLogging { + + val spark = SparkSession.active + + 
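+  // Delegates creation of plain (non-Delta, non-ClickHouse) tables to the delegated session catalog.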
private def createCatalogTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String] + ): Table = { + super.createTable(ident, schema, partitions, properties) + } + + override def createTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): Table = { + if (CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties))) { + createClickHouseTable( + ident, + schema, + partitions, + properties, + Map.empty, + sourceQuery = None, + TableCreationModes.Create) + } else if (DeltaSourceUtils.isDeltaDataSourceName(getProvider(properties))) { + createDeltaTable( + ident, + schema, + partitions, + properties, + Map.empty, + sourceQuery = None, + TableCreationModes.Create + ) + } else { + createCatalogTable(ident, schema, partitions, properties) + } + } + + /** + * Creates a ClickHouse table + * + * @param ident + * The identifier of the table + * @param schema + * The schema of the table + * @param partitions + * The partition transforms for the table + * @param allTableProperties + * The table properties that configure the behavior of the table or provide information about + * the table + * @param writeOptions + * Options specific to the write during table creation or replacement + * @param sourceQuery + * A query if this CREATE request came from a CTAS or RTAS + * @param operation + * The specific table creation mode, whether this is a Create/Replace/Create or Replace + */ + private def createClickHouseTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + allTableProperties: util.Map[String, String], + writeOptions: Map[String, String], + sourceQuery: Option[DataFrame], + operation: TableCreationModes.CreationMode): Table = { + val (partitionColumns, maybeBucketSpec) = + SparkShimLoader.getSparkShims.convertPartitionTransforms(partitions) + var newSchema = schema + var newPartitionColumns = partitionColumns + var newBucketSpec = maybeBucketSpec + + // Delta does not support bucket feature, so save the bucket infos into properties if exists. 
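+    // (Illustrative: for a `CLUSTERED BY (id) INTO 4 BUCKETS` spec, the bucket columns and count
+    // are folded into the MergeTree table properties by createMergeTreeConfigurations below,
+    // because Delta itself would not retain them.)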
+ val tableProperties = + ClickHouseConfig.createMergeTreeConfigurations(allTableProperties, newBucketSpec) + + val isByPath = isPathIdentifier(ident) + val location = if (isByPath) { + Option(ident.name()) + } else { + Option(allTableProperties.get("location")) + } + val locUriOpt = location.map(CatalogUtils.stringToURI) + val storage = DataSource + .buildStorageFormatFromOptions(writeOptions) + .copy(locationUri = locUriOpt) + val tableType = + if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED + val id = { + TableIdentifier(ident.name(), ident.namespace().lastOption) + } + val existingTableOpt = getExistingTableIfExists(id) + val loc = new Path(locUriOpt.getOrElse(spark.sessionState.catalog.defaultTablePath(id))) + val commentOpt = Option(allTableProperties.get("comment")) + + val tableDesc = new CatalogTable( + identifier = id, + tableType = tableType, + storage = storage, + schema = newSchema, + provider = Some(ClickHouseConfig.ALT_NAME), + partitionColumnNames = newPartitionColumns, + bucketSpec = newBucketSpec, + properties = tableProperties, + comment = commentOpt + ) + + val withDb = verifyTableAndSolidify(tableDesc, None, true) + + val writer = sourceQuery.map { + df => + WriteIntoDelta( + DeltaLog.forTable(spark, loc), + operation.mode, + new DeltaOptions(withDb.storage.properties, spark.sessionState.conf), + withDb.partitionColumnNames, + withDb.properties ++ commentOpt.map("comment" -> _), + df, + schemaInCatalog = if (newSchema != schema) Some(newSchema) else None + ) + } + try { + ClickHouseTableV2.temporalThreadLocalCHTable.set( + new TempClickHouseTableV2(spark, Some(withDb))) + + CreateDeltaTableCommand( + withDb, + existingTableOpt, + operation.mode, + writer, + operation = operation, + tableByPath = isByPath).run(spark) + } finally { + ClickHouseTableV2.temporalThreadLocalCHTable.remove() + } + + logInfo(s"create table ${ident.toString} successfully.") + loadTable(ident) + } + + /** + * Creates a Delta table + * + * @param ident + * The identifier of the table + * @param schema + * The schema of the table + * @param partitions + * The partition transforms for the table + * @param allTableProperties + * The table properties that configure the behavior of the table or provide information about + * the table + * @param writeOptions + * Options specific to the write during table creation or replacement + * @param sourceQuery + * A query if this CREATE request came from a CTAS or RTAS + * @param operation + * The specific table creation mode, whether this is a Create/Replace/Create or Replace + */ + private def createDeltaTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + allTableProperties: util.Map[String, String], + writeOptions: Map[String, String], + sourceQuery: Option[DataFrame], + operation: TableCreationModes.CreationMode + ): Table = { + // These two keys are tableProperties in data source v2 but not in v1, so we have to filter + // them out. Otherwise property consistency checks will fail. 
+ val tableProperties = allTableProperties.asScala.filterKeys { + case TableCatalog.PROP_LOCATION => false + case TableCatalog.PROP_PROVIDER => false + case TableCatalog.PROP_COMMENT => false + case TableCatalog.PROP_OWNER => false + case TableCatalog.PROP_EXTERNAL => false + case "path" => false + case _ => true + }.toMap + val (partitionColumns, maybeBucketSpec) = + SparkShimLoader.getSparkShims.convertPartitionTransforms(partitions) + var newSchema = schema + var newPartitionColumns = partitionColumns + var newBucketSpec = maybeBucketSpec + val conf = spark.sessionState.conf + + val isByPath = isPathIdentifier(ident) + if ( + isByPath && !conf.getConf(DeltaSQLConf.DELTA_LEGACY_ALLOW_AMBIGUOUS_PATHS) + && allTableProperties.containsKey("location") + // The location property can be qualified and different from the path in the identifier, so + // we check `endsWith` here. + && Option(allTableProperties.get("location")).exists(!_.endsWith(ident.name())) + ) { + throw DeltaErrors.ambiguousPathsInCreateTableException( + ident.name(), + allTableProperties.get("location")) + } + val location = if (isByPath) { + Option(ident.name()) + } else { + Option(allTableProperties.get("location")) + } + val id = { + TableIdentifier(ident.name(), ident.namespace().lastOption) + } + var locUriOpt = location.map(CatalogUtils.stringToURI) + val existingTableOpt = getExistingTableIfExists(id) + val loc = locUriOpt + .orElse(existingTableOpt.flatMap(_.storage.locationUri)) + .getOrElse(spark.sessionState.catalog.defaultTablePath(id)) + val storage = DataSource + .buildStorageFormatFromOptions(writeOptions) + .copy(locationUri = Option(loc)) + val tableType = + if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED + val commentOpt = Option(allTableProperties.get("comment")) + + var tableDesc = new CatalogTable( + identifier = id, + tableType = tableType, + storage = storage, + schema = newSchema, + provider = Some(DeltaSourceUtils.ALT_NAME), + partitionColumnNames = newPartitionColumns, + bucketSpec = newBucketSpec, + properties = tableProperties, + comment = commentOpt + ) + + val withDb = verifyTableAndSolidify(tableDesc, None) + + val writer = sourceQuery.map { + df => + WriteIntoDelta( + DeltaLog.forTable(spark, new Path(loc)), + operation.mode, + new DeltaOptions(withDb.storage.properties, spark.sessionState.conf), + withDb.partitionColumnNames, + withDb.properties ++ commentOpt.map("comment" -> _), + df, + schemaInCatalog = if (newSchema != schema) Some(newSchema) else None + ) + } + + CreateDeltaTableCommand( + withDb, + existingTableOpt, + operation.mode, + writer, + operation, + tableByPath = isByPath).run(spark) + + loadTable(ident) + } + + /** Performs checks on the parameters provided for table creation for a ClickHouse table. 
*/ + private def verifyTableAndSolidify( + tableDesc: CatalogTable, + query: Option[LogicalPlan], + isMergeTree: Boolean = false): CatalogTable = { + + if (!isMergeTree && tableDesc.bucketSpec.isDefined) { + throw DeltaErrors.operationNotSupportedException("Bucketing", tableDesc.identifier) + } + + val schema = query + .map { + plan => + assert(tableDesc.schema.isEmpty, "Can't specify table schema in CTAS.") + plan.schema.asNullable + } + .getOrElse(tableDesc.schema) + + PartitioningUtils.validatePartitionColumn( + schema, + tableDesc.partitionColumnNames, + caseSensitive = false + ) // Delta is case insensitive + + val validatedConfigurations = if (isMergeTree) { + tableDesc.properties + } else { + DeltaConfigs.validateConfigurations(tableDesc.properties) + } + + val db = tableDesc.identifier.database.getOrElse(catalog.getCurrentDatabase) + val tableIdentWithDB = tableDesc.identifier.copy(database = Some(db)) + tableDesc.copy( + identifier = tableIdentWithDB, + schema = schema, + properties = validatedConfigurations) + } + + /** Checks if a table already exists for the provided identifier. */ + def getExistingTableIfExists(table: TableIdentifier): Option[CatalogTable] = { + // If this is a path identifier, we cannot return an existing CatalogTable. The Create command + // will check the file system itself + if (isPathIdentifier(table)) return None + val tableExists = catalog.tableExists(table) + if (tableExists) { + val oldTable = catalog.getTableMetadata(table) + if (oldTable.tableType == CatalogTableType.VIEW) { + throw new AnalysisException(s"$table is a view. You may not write data into a view.") + } + if ( + !DeltaSourceUtils.isDeltaTable(oldTable.provider) && + !CHDataSourceUtils.isClickHouseTable(oldTable.provider) + ) { + throw DeltaErrors.notADeltaTable(table.table) + } + Some(oldTable) + } else { + None + } + } + + private def getProvider(properties: util.Map[String, String]): String = { + Option(properties.get("provider")).getOrElse(ClickHouseConfig.NAME) + } + + override def loadTable(ident: Identifier): Table = { + try { + super.loadTable(ident) match { + case v1: V1Table if CHDataSourceUtils.isClickHouseTable(v1.catalogTable) => + new ClickHouseTableV2( + spark, + new Path(v1.catalogTable.location), + catalogTable = Some(v1.catalogTable), + tableIdentifier = Some(ident.toString)) + case v1: V1Table if DeltaTableUtils.isDeltaTable(v1.catalogTable) => + DeltaTableV2( + spark, + new Path(v1.catalogTable.location), + catalogTable = Some(v1.catalogTable), + tableIdentifier = Some(ident.toString)) + case o => + o + } + } catch { + case _: NoSuchDatabaseException | _: NoSuchNamespaceException | _: NoSuchTableException + if isPathIdentifier(ident) => + newDeltaPathTable(ident) + case e: AnalysisException if gluePermissionError(e) && isPathIdentifier(ident) => + logWarning( + "Received an access denied error from Glue. 
Assuming this " + + s"identifier ($ident) is path based.", + e) + newDeltaPathTable(ident) + } + } + + private def newDeltaPathTable(ident: Identifier): DeltaTableV2 = { + if (hasClickHouseNamespace(ident)) { + new ClickHouseTableV2(spark, new Path(ident.name())) + } else { + DeltaTableV2(spark, new Path(ident.name())) + } + } + + /** support to delete mergetree data from the external table */ + override def purgeTable(ident: Identifier): Boolean = { + try { + loadTable(ident) match { + case t: ClickHouseTableV2 => + val tableType = t.properties().getOrDefault("Type", "") + // file-based or external table + val isExternal = tableType.isEmpty || tableType.equalsIgnoreCase("external") + val tablePath = t.rootPath + // first delete the table metadata + val deletedTable = super.dropTable(ident) + if (deletedTable && isExternal) { + val fs = tablePath.getFileSystem(spark.sessionState.newHadoopConf()) + // delete all data if there is a external table + fs.delete(tablePath, true) + } + true + case _ => super.purgeTable(ident) + } + } catch { + case _: Exception => + false + } + } + + override def stageReplace( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): StagedTable = + recordFrameProfile("DeltaCatalog", "stageReplace") { + if ( + CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties)) || + DeltaSourceUtils.isDeltaDataSourceName(getProvider(properties)) + ) { + new StagedDeltaTableV2(ident, schema, partitions, properties, TableCreationModes.Replace) + } else { + super.dropTable(ident) + val table = createCatalogTable(ident, schema, partitions, properties) + BestEffortStagedTable(ident, table, this) + } + } + + override def stageCreateOrReplace( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): StagedTable = + recordFrameProfile("DeltaCatalog", "stageCreateOrReplace") { + if ( + CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties)) || + DeltaSourceUtils.isDeltaDataSourceName(getProvider(properties)) + ) { + new StagedDeltaTableV2( + ident, + schema, + partitions, + properties, + TableCreationModes.CreateOrReplace) + } else { + try super.dropTable(ident) + catch { + case _: NoSuchDatabaseException => // this is fine + case _: NoSuchTableException => // this is fine + } + val table = createCatalogTable(ident, schema, partitions, properties) + BestEffortStagedTable(ident, table, this) + } + } + + override def stageCreate( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): StagedTable = + recordFrameProfile("DeltaCatalog", "stageCreate") { + if ( + CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties)) || + DeltaSourceUtils.isDeltaDataSourceName(getProvider(properties)) + ) { + new StagedDeltaTableV2(ident, schema, partitions, properties, TableCreationModes.Create) + } else { + val table = createCatalogTable(ident, schema, partitions, properties) + BestEffortStagedTable(ident, table, this) + } + } + + /** + * A staged delta table, which creates a HiveMetaStore entry and appends data if this was a + * CTAS/RTAS command. We have a ugly way of using this API right now, but it's the best way to + * maintain old behavior compatibility between Databricks Runtime and OSS Delta Lake. 
+ */ + private class StagedDeltaTableV2( + ident: Identifier, + override val schema: StructType, + val partitions: Array[Transform], + override val properties: util.Map[String, String], + operation: TableCreationModes.CreationMode) + extends StagedTable + with SupportsWrite { + + private var asSelectQuery: Option[DataFrame] = None + private var writeOptions: Map[String, String] = Map.empty + + override def commitStagedChanges(): Unit = + recordFrameProfile("DeltaCatalog", "commitStagedChanges") { + val conf = spark.sessionState.conf + val props = new util.HashMap[String, String]() + // Options passed in through the SQL API will show up both with an "option." prefix and + // without in Spark 3.1, so we need to remove those from the properties + val optionsThroughProperties = properties.asScala.collect { + case (k, _) if k.startsWith("option.") => k.stripPrefix("option.") + }.toSet + val sqlWriteOptions = new util.HashMap[String, String]() + properties.asScala.foreach { + case (k, v) => + if (!k.startsWith("option.") && !optionsThroughProperties.contains(k)) { + // Do not add to properties + props.put(k, v) + } else if (optionsThroughProperties.contains(k)) { + sqlWriteOptions.put(k, v) + } + } + if (writeOptions.isEmpty && !sqlWriteOptions.isEmpty) { + writeOptions = sqlWriteOptions.asScala.toMap + } + if (conf.getConf(DeltaSQLConf.DELTA_LEGACY_STORE_WRITER_OPTIONS_AS_PROPS)) { + // Legacy behavior + writeOptions.foreach { case (k, v) => props.put(k, v) } + } else { + writeOptions.foreach { + case (k, v) => + // Continue putting in Delta prefixed options to avoid breaking workloads + if (k.toLowerCase(Locale.ROOT).startsWith("delta.")) { + props.put(k, v) + } + } + } + if (CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties))) { + createClickHouseTable( + ident, + schema, + partitions, + props, + writeOptions, + asSelectQuery, + operation) + } else { + createDeltaTable(ident, schema, partitions, props, writeOptions, asSelectQuery, operation) + } + } + + override def name(): String = ident.name() + + override def abortStagedChanges(): Unit = {} + + override def capabilities(): util.Set[TableCapability] = Set(V1_BATCH_WRITE).asJava + + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { + writeOptions = info.options.asCaseSensitiveMap().asScala.toMap + new DeltaV1WriteBuilder + } + + /* + * WriteBuilder for creating a Delta table. 
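+     * It performs no eager write: build() only exposes an InsertableRelation that captures the
+     * incoming DataFrame into asSelectQuery, so that commitStagedChanges() can run the actual
+     * CTAS/RTAS write later.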
+ */ + private class DeltaV1WriteBuilder extends WriteBuilder { + override def build(): V1Write = new V1Write { + override def toInsertableRelation(): InsertableRelation = { + new InsertableRelation { + override def insert(data: DataFrame, overwrite: Boolean): Unit = { + asSelectQuery = Option(data) + } + } + } + } + } + } + + private case class BestEffortStagedTable(ident: Identifier, table: Table, catalog: TableCatalog) + extends StagedTable + with SupportsWrite { + override def abortStagedChanges(): Unit = catalog.dropTable(ident) + + override def commitStagedChanges(): Unit = {} + + // Pass through + override def name(): String = table.name() + override def schema(): StructType = table.schema() + override def partitioning(): Array[Transform] = table.partitioning() + override def capabilities(): util.Set[TableCapability] = table.capabilities() + override def properties(): util.Map[String, String] = table.properties() + + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = table match { + case supportsWrite: SupportsWrite => supportsWrite.newWriteBuilder(info) + case _ => throw DeltaErrors.unsupportedWriteStagedTable(name) + } + } +} + +/** + * A trait for handling table access through clickhouse.`/some/path`. This is a stop-gap solution + * until PathIdentifiers are implemented in Apache Spark. + */ +trait SupportsPathIdentifier extends TableCatalog { + self: ClickHouseSparkCatalog => + + protected lazy val catalog: SessionCatalog = spark.sessionState.catalog + + override def tableExists(ident: Identifier): Boolean = { + if (isPathIdentifier(ident)) { + val path = new Path(ident.name()) + val fs = path.getFileSystem(spark.sessionState.newHadoopConf()) + fs.exists(path) && fs.listStatus(path).nonEmpty + } else { + super.tableExists(ident) + } + } + + protected def isPathIdentifier(ident: Identifier): Boolean = { + // Should be a simple check of a special PathIdentifier class in the future + try { + supportSQLOnFile && (hasClickHouseNamespace(ident) || hasDeltaNamespace(ident)) && + new Path(ident.name()).isAbsolute + } catch { + case _: IllegalArgumentException => false + } + } + + protected def isPathIdentifier(table: CatalogTable): Boolean = { + isPathIdentifier(table.identifier) + } + + protected def isPathIdentifier(tableIdentifier: TableIdentifier): Boolean = { + isPathIdentifier(Identifier.of(tableIdentifier.database.toArray, tableIdentifier.table)) + } + + private def supportSQLOnFile: Boolean = spark.sessionState.conf.runSQLonFile + + protected def hasClickHouseNamespace(ident: Identifier): Boolean = { + ident.namespace().length == 1 && + CHDataSourceUtils.isClickHouseDataSourceName(ident.namespace().head) + } + + protected def hasDeltaNamespace(ident: Identifier): Boolean = { + ident.namespace().length == 1 && DeltaSourceUtils.isDeltaDataSourceName(ident.namespace().head) + } +} diff --git a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/execution/datasources/v2/clickhouse/source/DeltaMergeTreeFileFormat.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/execution/datasources/v2/clickhouse/source/DeltaMergeTreeFileFormat.scala similarity index 94% rename from backends-clickhouse/src/main/delta-22/org/apache/spark/sql/execution/datasources/v2/clickhouse/source/DeltaMergeTreeFileFormat.scala rename to backends-clickhouse/src/main/delta-23/org/apache/spark/sql/execution/datasources/v2/clickhouse/source/DeltaMergeTreeFileFormat.scala index 130790308db8c..44e775b66688b 100644 --- 
a/backends-clickhouse/src/main/delta-22/org/apache/spark/sql/execution/datasources/v2/clickhouse/source/DeltaMergeTreeFileFormat.scala +++ b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/execution/datasources/v2/clickhouse/source/DeltaMergeTreeFileFormat.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.types.StructType import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} +@SuppressWarnings(Array("io.github.zhztheplayer.scalawarts.InheritFromCaseClass")) class DeltaMergeTreeFileFormat(metadata: Metadata) extends DeltaParquetFileFormat(metadata) { protected var database = "" @@ -75,7 +76,10 @@ class DeltaMergeTreeFileFormat(metadata: Metadata) extends DeltaParquetFileForma override def equals(other: Any): Boolean = { other match { case ff: DeltaMergeTreeFileFormat => - ff.columnMappingMode == columnMappingMode && ff.referenceSchema == referenceSchema + ff.columnMappingMode == columnMappingMode && + ff.referenceSchema == referenceSchema && + ff.isSplittable == isSplittable && + ff.disablePushDowns == disablePushDowns case _ => false } } diff --git a/backends-clickhouse/src/main/delta-32/io/delta/tables/ClickhouseTable.scala b/backends-clickhouse/src/main/delta-32/io/delta/tables/ClickhouseTable.scala new file mode 100644 index 0000000000000..790b4c1f8a379 --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/io/delta/tables/ClickhouseTable.scala @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.tables + +import org.apache.spark.sql.{Dataset, Row, SparkSession} +import org.apache.spark.sql.delta.{DeltaErrors, DeltaTableIdentifier, DeltaTableUtils} +import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 + +import org.apache.hadoop.fs.Path + +import scala.collection.JavaConverters._ + +class ClickhouseTable( + @transient private val _df: Dataset[Row], + @transient private val table: ClickHouseTableV2) + extends DeltaTable(_df, table) { + + override def optimize(): DeltaOptimizeBuilder = { + DeltaOptimizeBuilder(table) + } +} + +object ClickhouseTable { + + /** + * Instantiate a [[DeltaTable]] object representing the data at the given path, If the given path + * is invalid (i.e. either no table exists or an existing table is not a Delta table), it throws a + * `not a Delta table` error. + * + * Note: This uses the active SparkSession in the current thread to read the table data. Hence, + * this throws error if active SparkSession has not been set, that is, + * `SparkSession.getActiveSession()` is empty. 
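+   *
+   * A minimal usage sketch, for illustration only (the path below is hypothetical):
+   * {{{
+   *   val table = ClickhouseTable.forPath("/data/clickhouse/events")
+   *   table.toDF.show()
+   * }}}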
+ * + * @since 0.3.0 + */ + def forPath(path: String): DeltaTable = { + val sparkSession = SparkSession.getActiveSession.getOrElse { + throw DeltaErrors.activeSparkSessionNotFound() + } + forPath(sparkSession, path) + } + + /** + * Instantiate a [[DeltaTable]] object representing the data at the given path, If the given path + * is invalid (i.e. either no table exists or an existing table is not a Delta table), it throws a + * `not a Delta table` error. + * + * @since 0.3.0 + */ + def forPath(sparkSession: SparkSession, path: String): DeltaTable = { + forPath(sparkSession, path, Map.empty[String, String]) + } + + /** + * Instantiate a [[DeltaTable]] object representing the data at the given path, If the given path + * is invalid (i.e. either no table exists or an existing table is not a Delta table), it throws a + * `not a Delta table` error. + * + * @param hadoopConf + * Hadoop configuration starting with "fs." or "dfs." will be picked up by `DeltaTable` to + * access the file system when executing queries. Other configurations will not be allowed. + * + * {{{ + * val hadoopConf = Map( + * "fs.s3a.access.key" -> "", + * "fs.s3a.secret.key" -> "" + * ) + * DeltaTable.forPath(spark, "/path/to/table", hadoopConf) + * }}} + * @since 2.2.0 + */ + def forPath( + sparkSession: SparkSession, + path: String, + hadoopConf: scala.collection.Map[String, String]): DeltaTable = { + // We only pass hadoopConf so that we won't pass any unsafe options to Delta. + val badOptions = hadoopConf.filterKeys { + k => !DeltaTableUtils.validDeltaTableHadoopPrefixes.exists(k.startsWith) + }.toMap + if (!badOptions.isEmpty) { + throw DeltaErrors.unsupportedDeltaTableForPathHadoopConf(badOptions) + } + val fileSystemOptions: Map[String, String] = hadoopConf.toMap + val hdpPath = new Path(path) + if (DeltaTableUtils.isDeltaTable(sparkSession, hdpPath, fileSystemOptions)) { + new ClickhouseTable( + sparkSession.read.format("clickhouse").options(fileSystemOptions).load(path), + new ClickHouseTableV2(spark = sparkSession, path = hdpPath, options = fileSystemOptions) + ) + } else { + throw DeltaErrors.notADeltaTableException(DeltaTableIdentifier(path = Some(path))) + } + } + + /** + * Java friendly API to instantiate a [[DeltaTable]] object representing the data at the given + * path, If the given path is invalid (i.e. either no table exists or an existing table is not a + * Delta table), it throws a `not a Delta table` error. + * + * @param hadoopConf + * Hadoop configuration starting with "fs." or "dfs." will be picked up by `DeltaTable` to + * access the file system when executing queries. Other configurations will be ignored. 
+ * + * {{{ + * val hadoopConf = Map( + * "fs.s3a.access.key" -> "", + * "fs.s3a.secret.key", "" + * ) + * DeltaTable.forPath(spark, "/path/to/table", hadoopConf) + * }}} + * @since 2.2.0 + */ + def forPath( + sparkSession: SparkSession, + path: String, + hadoopConf: java.util.Map[String, String]): DeltaTable = { + val fsOptions = hadoopConf.asScala.toMap + forPath(sparkSession, path, fsOptions) + } +} diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala new file mode 100644 index 0000000000000..9e79c4f2e9845 --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.delta + +import org.apache.gluten.backendsapi.clickhouse.CHBackendSettings +import org.apache.gluten.execution.ColumnarToRowExecBase + +import org.apache.spark.SparkException +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.delta.actions._ +import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 +import org.apache.spark.sql.delta.constraints.{Constraint, Constraints} +import org.apache.spark.sql.delta.files.MergeTreeCommitProtocol +import org.apache.spark.sql.delta.schema.InvariantViolationException +import org.apache.spark.sql.delta.sources.DeltaSQLConf +import org.apache.spark.sql.execution.{SparkPlan, SQLExecution} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec +import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FakeRowAdaptor, FileFormatWriter, WriteJobStatsTracker} +import org.apache.spark.sql.execution.datasources.v1.clickhouse.MergeTreeFileFormatWriter +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig +import org.apache.spark.util.SerializableConfiguration + +import org.apache.commons.lang3.exception.ExceptionUtils + +import scala.collection.mutable.ListBuffer + +object ClickhouseOptimisticTransaction {} + +class ClickhouseOptimisticTransaction( + override val deltaLog: DeltaLog, + override val catalogTable: Option[CatalogTable], + override val snapshot: Snapshot) + extends OptimisticTransaction(deltaLog, catalogTable, snapshot) { + + def this( + deltaLog: DeltaLog, + catalogTable: Option[CatalogTable], + snapshotOpt: Option[Snapshot] = None) = { + this( + deltaLog, + catalogTable, + snapshotOpt.getOrElse(deltaLog.update()) + ) + } + + def insertFakeRowAdaptor(queryPlan: SparkPlan): SparkPlan = queryPlan match { + // if the child is columnar, we can just wrap&transfer the columnar data + 
case c2r: ColumnarToRowExecBase => + FakeRowAdaptor(c2r.child) + // If the child is aqe, we make aqe "support columnar", + // then aqe itself will guarantee to generate columnar outputs. + // So FakeRowAdaptor will always consumes columnar data, + // thus avoiding the case of c2r->aqe->r2c->writer + case aqe: AdaptiveSparkPlanExec => + FakeRowAdaptor( + AdaptiveSparkPlanExec( + aqe.inputPlan, + aqe.context, + aqe.preprocessingRules, + aqe.isSubquery, + supportsColumnar = true + )) + case other => FakeRowAdaptor(other) + } + + override def writeFiles( + inputData: Dataset[_], + writeOptions: Option[DeltaOptions], + additionalConstraints: Seq[Constraint]): Seq[FileAction] = { + if (ClickHouseConfig.isMergeTreeFormatEngine(metadata.configuration)) { + hasWritten = true + + val spark = inputData.sparkSession + val (data, partitionSchema) = performCDCPartition(inputData) + val outputPath = deltaLog.dataPath + + val (queryExecution, output, generatedColumnConstraints, _) = + normalizeData(deltaLog, writeOptions, data) + val partitioningColumns = getPartitioningColumns(partitionSchema, output) + + val committer = + new MergeTreeCommitProtocol("delta-mergetree", outputPath.toString, None, None) + + // val (optionalStatsTracker, _) = + // getOptionalStatsTrackerAndStatsCollection(output, outputPath, partitionSchema, data) + val (optionalStatsTracker, _) = (None, None) + + val constraints = + Constraints.getAll(metadata, spark) ++ generatedColumnConstraints ++ additionalConstraints + + SQLExecution.withNewExecutionId(queryExecution, Option("deltaTransactionalWrite")) { + val outputSpec = FileFormatWriter.OutputSpec(outputPath.toString, Map.empty, output) + + val queryPlan = queryExecution.executedPlan + val newQueryPlan = insertFakeRowAdaptor(queryPlan) + + val statsTrackers: ListBuffer[WriteJobStatsTracker] = ListBuffer() + + if (spark.conf.get(DeltaSQLConf.DELTA_HISTORY_METRICS_ENABLED)) { + val basicWriteJobStatsTracker = new BasicWriteJobStatsTracker( + new SerializableConfiguration(deltaLog.newDeltaHadoopConf()), + BasicWriteJobStatsTracker.metrics) + // registerSQLMetrics(spark, basicWriteJobStatsTracker.driverSideMetrics) + statsTrackers.append(basicWriteJobStatsTracker) + } + + // Iceberg spec requires partition columns in data files + val writePartitionColumns = IcebergCompat.isAnyEnabled(metadata) + // Retain only a minimal selection of Spark writer options to avoid any potential + // compatibility issues + var options = (writeOptions match { + case None => Map.empty[String, String] + case Some(writeOptions) => + writeOptions.options.filterKeys { + key => + key.equalsIgnoreCase(DeltaOptions.MAX_RECORDS_PER_FILE) || + key.equalsIgnoreCase(DeltaOptions.COMPRESSION) + }.toMap + }) + (DeltaOptions.WRITE_PARTITION_COLUMNS -> writePartitionColumns.toString) + + spark.conf.getAll.foreach( + entry => { + if ( + entry._1.startsWith(s"${CHBackendSettings.getBackendConfigPrefix}.runtime_settings") + || entry._1.equalsIgnoreCase(DeltaSQLConf.DELTA_OPTIMIZE_MIN_FILE_SIZE.key) + ) { + options += (entry._1 -> entry._2) + } + }) + + try { + val tableV2 = ClickHouseTableV2.getTable(deltaLog) + MergeTreeFileFormatWriter.write( + sparkSession = spark, + plan = newQueryPlan, + fileFormat = tableV2.getFileFormat(protocol, metadata), + // formats. 
+ committer = committer, + outputSpec = outputSpec, + // scalastyle:off deltahadoopconfiguration + hadoopConf = spark.sessionState + .newHadoopConfWithOptions(metadata.configuration ++ deltaLog.options), + // scalastyle:on deltahadoopconfiguration + orderByKeyOption = tableV2.orderByKeyOption, + lowCardKeyOption = tableV2.lowCardKeyOption, + minmaxIndexKeyOption = tableV2.minmaxIndexKeyOption, + bfIndexKeyOption = tableV2.bfIndexKeyOption, + setIndexKeyOption = tableV2.setIndexKeyOption, + primaryKeyOption = tableV2.primaryKeyOption, + partitionColumns = partitioningColumns, + bucketSpec = tableV2.bucketOption, + statsTrackers = optionalStatsTracker.toSeq ++ statsTrackers, + options = options, + constraints = constraints + ) + } catch { + case s: SparkException => + // Pull an InvariantViolationException up to the top level if it was the root cause. + val violationException = ExceptionUtils.getRootCause(s) + if (violationException.isInstanceOf[InvariantViolationException]) { + throw violationException + } else { + throw s + } + } + } + committer.addedStatuses.toSeq ++ committer.changeFiles + } else { + // TODO: support native delta parquet write + // 1. insert FakeRowAdaptor + // 2. DeltaInvariantCheckerExec transform + // 3. DeltaTaskStatisticsTracker collect null count / min values / max values + // 4. set the parameters 'staticPartitionWriteOnly', 'isNativeAppliable', + // 'nativeFormat' in the LocalProperty of the sparkcontext + super.writeFiles(inputData, writeOptions, additionalConstraints) + } + } +} diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/DeltaAdapter.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/DeltaAdapter.scala new file mode 100644 index 0000000000000..8a9c5585e888b --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/DeltaAdapter.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.delta + +object DeltaAdapter extends DeltaAdapterTrait { + override def snapshot(deltaLog: DeltaLog): Snapshot = deltaLog.unsafeVolatileSnapshot +} diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/DeltaLog.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/DeltaLog.scala new file mode 100644 index 0000000000000..dca14d7fb1fb8 --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/DeltaLog.scala @@ -0,0 +1,1106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.delta + +// scalastyle:off import.ordering.noEmptyLine +import java.io.IOException +import java.lang.ref.WeakReference +import java.net.URI +import java.util.concurrent.TimeUnit + +import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.util.control.NonFatal + +import com.databricks.spark.util.TagDefinitions._ +import org.apache.spark.sql.delta.actions._ +import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 +import org.apache.spark.sql.delta.commands.WriteIntoDelta +import org.apache.spark.sql.delta.files.{TahoeBatchFileIndex, TahoeLogFileIndex} +import org.apache.spark.sql.delta.managedcommit.ManagedCommitUtils +import org.apache.spark.sql.delta.metering.DeltaLogging +import org.apache.spark.sql.delta.schema.{SchemaMergingUtils, SchemaUtils} +import org.apache.spark.sql.delta.sources._ +import org.apache.spark.sql.delta.storage.LogStoreProvider +import org.apache.spark.sql.delta.util.FileNames +import com.google.common.cache.{Cache, CacheBuilder, RemovalNotification} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.{FileSourceOptions, TableIdentifier} +import org.apache.spark.sql.catalyst.analysis.{Resolver, UnresolvedAttribute} +import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable} +import org.apache.spark.sql.catalyst.expressions.{And, Attribute, Cast, Expression, Literal} +import org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper +import org.apache.spark.sql.catalyst.util.FailFastMode +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig +import org.apache.spark.sql.expressions.UserDefinedFunction +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation} +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.util._ + +/** + * Gluten overwrite Delta: + * + * This file is copied from Delta 3.2.0, it is modified to overcome the following issues: + * 1. return ClickhouseOptimisticTransaction + * 2. return DeltaMergeTreeFileFormat + * 3. create HadoopFsRelation with the bucket options + */ + +/** + * Used to query the current state of the log as well as modify it by adding + * new atomic collections of actions. + * + * Internally, this class implements an optimistic concurrency control + * algorithm to handle multiple readers or writers. Any single read + * is guaranteed to see a consistent snapshot of the table. + * + * @param logPath Path of the Delta log JSONs. + * @param dataPath Path of the data files. + * @param options Filesystem options filtered from `allOptions`. + * @param allOptions All options provided by the user, for example via `df.write.option()`. 
This + * includes but not limited to filesystem and table properties. + * @param clock Clock to be used when starting a new transaction. + */ +class DeltaLog private( + val logPath: Path, + val dataPath: Path, + val options: Map[String, String], + val allOptions: Map[String, String], + val clock: Clock + ) extends Checkpoints + with MetadataCleanup + with LogStoreProvider + with SnapshotManagement + with DeltaFileFormat + with ProvidesUniFormConverters + with ReadChecksum { + + import org.apache.spark.sql.delta.files.TahoeFileIndex + + /** + * Path to sidecar directory. + * This is intentionally kept `lazy val` as otherwise any other constructor codepaths in DeltaLog + * (e.g. SnapshotManagement etc) will see it as null as they are executed before this line is + * called. + */ + lazy val sidecarDirPath: Path = FileNames.sidecarDirPath(logPath) + + + protected def spark = SparkSession.active + + checkRequiredConfigurations() + + /** + * Keep a reference to `SparkContext` used to create `DeltaLog`. `DeltaLog` cannot be used when + * `SparkContext` is stopped. We keep the reference so that we can check whether the cache is + * still valid and drop invalid `DeltaLog`` objects. + */ + private val sparkContext = new WeakReference(spark.sparkContext) + + /** + * Returns the Hadoop [[Configuration]] object which can be used to access the file system. All + * Delta code should use this method to create the Hadoop [[Configuration]] object, so that the + * hadoop file system configurations specified in DataFrame options will come into effect. + */ + // scalastyle:off deltahadoopconfiguration + final def newDeltaHadoopConf(): Configuration = + spark.sessionState.newHadoopConfWithOptions(options) + // scalastyle:on deltahadoopconfiguration + + /** Used to read and write physical log files and checkpoints. */ + lazy val store = createLogStore(spark) + + /** Delta History Manager containing version and commit history. */ + lazy val history = new DeltaHistoryManager( + this, spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_HISTORY_PAR_SEARCH_THRESHOLD)) + + /* --------------- * + | Configuration | + * --------------- */ + + /** + * The max lineage length of a Snapshot before Delta forces to build a Snapshot from scratch. + * Delta will build a Snapshot on top of the previous one if it doesn't see a checkpoint. + * However, there is a race condition that when two writers are writing at the same time, + * a writer may fail to pick up checkpoints written by another one, and the lineage will grow + * and finally cause StackOverflowError. Hence we have to force to build a Snapshot from scratch + * when the lineage length is too large to avoid hitting StackOverflowError. + */ + def maxSnapshotLineageLength: Int = + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_MAX_SNAPSHOT_LINEAGE_LENGTH) + + /** The unique identifier for this table. */ + def tableId: String = unsafeVolatileMetadata.id // safe because table id never changes + + /** + * Combines the tableId with the path of the table to ensure uniqueness. Normally `tableId` + * should be globally unique, but nothing stops users from copying a Delta table directly to + * a separate location, where the transaction log is copied directly, causing the tableIds to + * match. When users mutate the copied table, and then try to perform some checks joining the + * two tables, optimizations that depend on `tableId` alone may not be correct. Hence we use a + * composite id. 
+ */ + private[delta] def compositeId: (String, Path) = tableId -> dataPath + + /** + * Creates a [[LogicalRelation]] for a given [[DeltaLogFileIndex]], with all necessary file source + * options taken from the Delta Log. All reads of Delta metadata files should use this method. + */ + def indexToRelation( + index: DeltaLogFileIndex, + schema: StructType = Action.logSchema): LogicalRelation = { + DeltaLog.indexToRelation(spark, index, options, schema) + } + + /** + * Load the data using the FileIndex. This allows us to skip many checks that add overhead, e.g. + * file existence checks, partitioning schema inference. + */ + def loadIndex( + index: DeltaLogFileIndex, + schema: StructType = Action.logSchema): DataFrame = { + Dataset.ofRows(spark, indexToRelation(index, schema)) + } + + /* ------------------ * + | Delta Management | + * ------------------ */ + + /** + * Returns a new [[OptimisticTransaction]] that can be used to read the current state of the log + * and then commit updates. The reads and updates will be checked for logical conflicts with any + * concurrent writes to the log, and post-commit hooks can be used to notify the table's catalog + * of schema changes, etc. + * + * Note that all reads in a transaction must go through the returned transaction object, and not + * directly to the [[DeltaLog]] otherwise they will not be checked for conflicts. + * + * @param catalogTableOpt The [[CatalogTable]] for the table this transaction updates. Passing + * None asserts this is a path-based table with no catalog entry. + * + * @param snapshotOpt The [[Snapshot]] this transaction should use, if not latest. + */ + def startTransaction( + catalogTableOpt: Option[CatalogTable], + snapshotOpt: Option[Snapshot] = None): OptimisticTransaction = { + TransactionExecutionObserver.threadObserver.get().startingTransaction { + // --- modified start + new ClickhouseOptimisticTransaction(this, catalogTableOpt, snapshotOpt) + // --- modified end + } + } + + /** Legacy/compat overload that does not require catalog table information. Avoid prod use. */ + // --- modified start + // @deprecated("Please use the CatalogTable overload instead", "3.0") + // --- modified end + def startTransaction(): OptimisticTransaction = { + startTransaction(catalogTableOpt = None, snapshotOpt = None) + } + + /** + * Execute a piece of code within a new [[OptimisticTransaction]]. Reads/write sets will + * be recorded for this table, and all other tables will be read + * at a snapshot that is pinned on the first access. + * + * @param catalogTableOpt The [[CatalogTable]] for the table this transaction updates. Passing + * None asserts this is a path-based table with no catalog entry. + * + * @param snapshotOpt The [[Snapshot]] this transaction should use, if not latest. + * @note This uses a thread-local variable to make the active transaction visible. So do not use + * multi-threaded code in the provided thunk. + */ + def withNewTransaction[T]( + catalogTableOpt: Option[CatalogTable], + snapshotOpt: Option[Snapshot] = None)( + thunk: OptimisticTransaction => T): T = { + try { + val txn = startTransaction(catalogTableOpt, snapshotOpt) + OptimisticTransaction.setActive(txn) + thunk(txn) + } finally { + OptimisticTransaction.clearActive() + } + } + + /** Legacy/compat overload that does not require catalog table information. Avoid prod use.
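+   * A minimal call-pattern sketch, for illustration only (new code should prefer the
+   * CatalogTable overload):
+   * {{{
+   *   deltaLog.withNewTransaction { txn =>
+   *     // reads must go through txn so that conflicts can be detected
+   *     txn.metadata.schema
+   *   }
+   * }}}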
*/ + @deprecated("Please use the CatalogTable overload instead", "3.0") + def withNewTransaction[T](thunk: OptimisticTransaction => T): T = { + try { + val txn = startTransaction() + OptimisticTransaction.setActive(txn) + thunk(txn) + } finally { + OptimisticTransaction.clearActive() + } + } + + + /** + * Upgrade the table's protocol version, by default to the maximum recognized reader and writer + * versions in this Delta release. This method only upgrades protocol version, and will fail if + * the new protocol version is not a superset of the original one used by the snapshot. + */ + def upgradeProtocol( + catalogTable: Option[CatalogTable], + snapshot: Snapshot, + newVersion: Protocol): Unit = { + val currentVersion = snapshot.protocol + if (newVersion == currentVersion) { + logConsole(s"Table $dataPath is already at protocol version $newVersion.") + return + } + if (!currentVersion.canUpgradeTo(newVersion)) { + throw new ProtocolDowngradeException(currentVersion, newVersion) + } + + val txn = startTransaction(catalogTable, Some(snapshot)) + try { + SchemaMergingUtils.checkColumnNameDuplication(txn.metadata.schema, "in the table schema") + } catch { + case e: AnalysisException => + throw DeltaErrors.duplicateColumnsOnUpdateTable(e) + } + txn.commit(Seq(newVersion), DeltaOperations.UpgradeProtocol(newVersion)) + logConsole(s"Upgraded table at $dataPath to $newVersion.") + } + + /** + * Get all actions starting from "startVersion" (inclusive). If `startVersion` doesn't exist, + * return an empty Iterator. + * Callers are encouraged to use the other override which takes the endVersion if available to + * avoid I/O and improve performance of this method. + */ + def getChanges( + startVersion: Long, + failOnDataLoss: Boolean = false): Iterator[(Long, Seq[Action])] = { + getChangeLogFiles(startVersion, failOnDataLoss).map { case (version, status) => + (version, store.read(status, newDeltaHadoopConf()).map(Action.fromJson(_))) + } + } + + private[sql] def getChanges( + startVersion: Long, + endVersion: Long, + failOnDataLoss: Boolean): Iterator[(Long, Seq[Action])] = { + getChangeLogFiles(startVersion, endVersion, failOnDataLoss).map { case (version, status) => + (version, store.read(status, newDeltaHadoopConf()).map(Action.fromJson(_))) + } + } + + private[sql] def getChangeLogFiles( + startVersion: Long, + endVersion: Long, + failOnDataLoss: Boolean): Iterator[(Long, FileStatus)] = { + implicit class IteratorWithStopAtHelper[T](underlying: Iterator[T]) { + // This method is used to stop the iterator when the condition is met. + def stopAt(stopAtFunc: (T) => Boolean): Iterator[T] = new Iterator[T] { + var shouldStop = false + + override def hasNext: Boolean = !shouldStop && underlying.hasNext + + override def next(): T = { + val v = underlying.next() + shouldStop = stopAtFunc(v) + v + } + } + } + + getChangeLogFiles(startVersion, failOnDataLoss) + // takeWhile always looks at one extra item, which can trigger unnecessary work. Instead, we + // stop if we've seen the item we believe should be the last interesting item, without + // examining the one that follows. + .stopAt { case (version, _) => version >= endVersion } + // The last element in this iterator may not be <= endVersion, so we need to filter it out. + .takeWhile { case (version, _) => version <= endVersion } + } + + /** + * Get access to all actions starting from "startVersion" (inclusive) via [[FileStatus]]. + * If `startVersion` doesn't exist, return an empty Iterator. 
+ * Callers are encouraged to use the other override which takes the endVersion if available to + * avoid I/O and improve performance of this method. + */ + def getChangeLogFiles( + startVersion: Long, + failOnDataLoss: Boolean = false): Iterator[(Long, FileStatus)] = { + val deltasWithVersion = ManagedCommitUtils.commitFilesIterator(this, startVersion) + // Subtract 1 to ensure that we have the same check for the inclusive startVersion + var lastSeenVersion = startVersion - 1 + deltasWithVersion.map { case (status, version) => + if (failOnDataLoss && version > lastSeenVersion + 1) { + throw DeltaErrors.failOnDataLossException(lastSeenVersion + 1, version) + } + lastSeenVersion = version + (version, status) + } + } + + /* --------------------- * + | Protocol validation | + * --------------------- */ + + /** + * Asserts that the highest protocol supported by this client is not less than what is required + * by the table for performing read or write operations. This ensures the client supports a + * greater-or-equal protocol version and recognizes/supports all features enabled by the table. + * + * The operation type to be checked is passed as a string in `readOrWrite`. Valid values are + * `read` and `write`. + */ + private def protocolCheck(tableProtocol: Protocol, readOrWrite: String): Unit = { + val clientSupportedProtocol = Action.supportedProtocolVersion() + // Depending on the operation, pull related protocol versions out of Protocol objects. + // `getEnabledFeatures` is a function used to pull reader/writer features out of a Protocol. + val (clientSupportedVersions, tableRequiredVersion, getEnabledFeatures) = readOrWrite match { + case "read" => ( + Action.supportedReaderVersionNumbers, + tableProtocol.minReaderVersion, + (f: Protocol) => f.readerFeatureNames) + case "write" => ( + Action.supportedWriterVersionNumbers, + tableProtocol.minWriterVersion, + (f: Protocol) => f.writerFeatureNames) + case _ => + throw new IllegalArgumentException("Table operation must be either `read` or `write`.") + } + + // The check is complete when both the protocol version and all referenced features are + // supported. + val clientSupportedFeatureNames = getEnabledFeatures(clientSupportedProtocol) + val tableEnabledFeatureNames = getEnabledFeatures(tableProtocol) + if (tableEnabledFeatureNames.subsetOf(clientSupportedFeatureNames) && + clientSupportedVersions.contains(tableRequiredVersion)) { + return + } + + // Otherwise, either the protocol version or some features referenced by the table are + // unsupported. + val clientUnsupportedFeatureNames = + tableEnabledFeatureNames.diff(clientSupportedFeatureNames) + // Prepare event log constants and the appropriate error message handler.
+ val (opType, versionKey, unsupportedFeaturesException) = readOrWrite match { + case "read" => ( + "delta.protocol.failure.read", + "minReaderVersion", + DeltaErrors.unsupportedReaderTableFeaturesInTableException _) + case "write" => ( + "delta.protocol.failure.write", + "minWriterVersion", + DeltaErrors.unsupportedWriterTableFeaturesInTableException _) + } + recordDeltaEvent( + this, + opType, + data = Map( + "clientVersion" -> clientSupportedVersions.max, + versionKey -> tableRequiredVersion, + "clientFeatures" -> clientSupportedFeatureNames.mkString(","), + "clientUnsupportedFeatures" -> clientUnsupportedFeatureNames.mkString(","))) + if (!clientSupportedVersions.contains(tableRequiredVersion)) { + throw new InvalidProtocolVersionException( + dataPath.toString(), + tableProtocol.minReaderVersion, + tableProtocol.minWriterVersion, + Action.supportedReaderVersionNumbers.toSeq, + Action.supportedWriterVersionNumbers.toSeq) + } else { + throw unsupportedFeaturesException(dataPath.toString(), clientUnsupportedFeatureNames) + } + } + + /** + * Asserts that the table's protocol enabled all features that are active in the metadata. + * + * A mismatch shouldn't happen when the table has gone through a proper write process because we + * require all active features during writes. However, other clients may void this guarantee. + */ + def assertTableFeaturesMatchMetadata( + targetProtocol: Protocol, + targetMetadata: Metadata): Unit = { + if (!targetProtocol.supportsReaderFeatures && !targetProtocol.supportsWriterFeatures) return + + val protocolEnabledFeatures = targetProtocol.writerFeatureNames + .flatMap(TableFeature.featureNameToFeature) + val activeFeatures = + Protocol.extractAutomaticallyEnabledFeatures(spark, targetMetadata, Some(targetProtocol)) + val activeButNotEnabled = activeFeatures.diff(protocolEnabledFeatures) + if (activeButNotEnabled.nonEmpty) { + throw DeltaErrors.tableFeatureMismatchException(activeButNotEnabled.map(_.name)) + } + } + + /** + * Asserts that the client is up to date with the protocol and allowed to read the table that is + * using the given `protocol`. + */ + def protocolRead(protocol: Protocol): Unit = { + protocolCheck(protocol, "read") + } + + /** + * Asserts that the client is up to date with the protocol and allowed to write to the table + * that is using the given `protocol`. + */ + def protocolWrite(protocol: Protocol): Unit = { + protocolCheck(protocol, "write") + } + + /* ---------------------------------------- * + | Log Directory Management and Retention | + * ---------------------------------------- */ + + /** + * Whether a Delta table exists at this directory. + * It is okay to use the cached volatile snapshot here, since the worst case is that the table + * has recently started existing which hasn't been picked up here. If so, any subsequent command + * that updates the table will see the right value. + */ + def tableExists: Boolean = unsafeVolatileSnapshot.version >= 0 + + def isSameLogAs(otherLog: DeltaLog): Boolean = this.compositeId == otherLog.compositeId + + /** Creates the log directory if it does not exist. */ + def ensureLogDirectoryExist(): Unit = { + val fs = logPath.getFileSystem(newDeltaHadoopConf()) + def createDirIfNotExists(path: Path): Unit = { + // Optimistically attempt to create the directory first without checking its existence. + // This is efficient because we're assuming it's more likely that the directory doesn't + // exist and it saves an filesystem existence check in that case. 
+ val (success, mkdirsIOExceptionOpt) = try { + // Return value of false should mean the directory already existed (not an error) but + // we will verify below because we're paranoid about buggy FileSystem implementations. + (fs.mkdirs(path), None) + } catch { + // A FileAlreadyExistsException is expected if a non-directory object exists but an explicit + // check is needed because buggy Hadoop FileSystem.mkdir wrongly throws the exception even + // on existing directories. + case io: IOException => + val dirExists = + try { + fs.getFileStatus(path).isDirectory + } catch { + case NonFatal(_) => false + } + (dirExists, Some(io)) + } + if (!success) { + throw DeltaErrors.cannotCreateLogPathException( + logPath = logPath.toString, + cause = mkdirsIOExceptionOpt.orNull) + } + } + createDirIfNotExists(FileNames.commitDirPath(logPath)) + } + + /** + * Create the log directory. Unlike `ensureLogDirectoryExist`, this method doesn't check whether + * the log directory exists and it will ignore the return value of `mkdirs`. + */ + def createLogDirectory(): Unit = { + logPath.getFileSystem(newDeltaHadoopConf()).mkdirs(logPath) + } + + /* ------------ * + | Integration | + * ------------ */ + + /** + * Returns a [[org.apache.spark.sql.DataFrame]] containing the new files within the specified + * version range. + */ + def createDataFrame( + snapshot: SnapshotDescriptor, + addFiles: Seq[AddFile], + isStreaming: Boolean = false, + actionTypeOpt: Option[String] = None): DataFrame = { + val actionType = actionTypeOpt.getOrElse(if (isStreaming) "streaming" else "batch") + // It's ok to not pass down the partitionSchema to TahoeBatchFileIndex. Schema evolution will + // ensure any partitionSchema changes will be captured, and upon restart, the new snapshot will + // be initialized with the correct partition schema again. + val fileIndex = new TahoeBatchFileIndex(spark, actionType, addFiles, this, dataPath, snapshot) + // --- modified start + // TODO: Don't add the bucketOption here, it will cause the OOM when the merge into update + // key is the bucket column, fix later + // --- modified end + val relation = buildHadoopFsRelationWithFileIndex(snapshot, fileIndex, bucketSpec = None) + Dataset.ofRows(spark, LogicalRelation(relation, isStreaming = isStreaming)) + } + + /** + * Returns a [[BaseRelation]] that contains all of the data present + * in the table. This relation will be continually updated + * as files are added or removed from the table. However, new [[BaseRelation]] + * must be requested in order to see changes to the schema. + */ + def createRelation( + partitionFilters: Seq[Expression] = Nil, + snapshotToUseOpt: Option[Snapshot] = None, + catalogTableOpt: Option[CatalogTable] = None, + isTimeTravelQuery: Boolean = false): BaseRelation = { + + /** Used to link the files present in the table into the query planner. */ + // TODO: If snapshotToUse is unspecified, get the correct snapshot from update() + val snapshotToUse = snapshotToUseOpt.getOrElse(unsafeVolatileSnapshot) + if (snapshotToUse.version < 0) { + // A negative version here means the dataPath is an empty directory. Read query should error + // out in this case. 
+ throw DeltaErrors.pathNotExistsException(dataPath.toString) + } + + val fileIndex = TahoeLogFileIndex( + spark, this, dataPath, snapshotToUse, partitionFilters, isTimeTravelQuery) + // --- modified start + var bucketSpec: Option[BucketSpec] = + if (ClickHouseConfig.isMergeTreeFormatEngine(snapshotToUse.metadata.configuration)) { + ClickHouseTableV2.getTable(this).bucketOption + } else { + None + } + + val r = buildHadoopFsRelationWithFileIndex(snapshotToUse, fileIndex, bucketSpec = bucketSpec) + new DeltaLog.DeltaHadoopFsRelation( + r.location, + r.partitionSchema, + r.dataSchema, + r.bucketSpec, + r.fileFormat, + r.options + )(spark, this, catalogTableOpt) + // --- modified end + } + + def buildHadoopFsRelationWithFileIndex(snapshot: SnapshotDescriptor, fileIndex: TahoeFileIndex, + bucketSpec: Option[BucketSpec]): HadoopFsRelation = { + HadoopFsRelation( + fileIndex, + partitionSchema = DeltaColumnMapping.dropColumnMappingMetadata( + snapshot.metadata.partitionSchema), + // We pass all table columns as `dataSchema` so that Spark will preserve the partition + // column locations. Otherwise, for any partition columns not in `dataSchema`, Spark would + // just append them to the end of `dataSchema`. + dataSchema = DeltaColumnMapping.dropColumnMappingMetadata( + DeltaTableUtils.removeInternalMetadata(spark, + SchemaUtils.dropNullTypeColumns(snapshot.metadata.schema))), + bucketSpec = bucketSpec, + fileFormat(snapshot.protocol, snapshot.metadata), + // `metadata.format.options` is not set today. Even if we support it in future, we shouldn't + // store any file system options since they may contain credentials. Hence, it will never + // conflict with `DeltaLog.options`. + snapshot.metadata.format.options ++ options)(spark) + } + + /** + * Verify the required Spark conf for delta + * Throw `DeltaErrors.configureSparkSessionWithExtensionAndCatalog` exception if + * `spark.sql.catalog.spark_catalog` config is missing. We do not check for + * `spark.sql.extensions` because DeltaSparkSessionExtension can alternatively + * be activated using the `.withExtension()` API. This check can be disabled + * by setting DELTA_CHECK_REQUIRED_SPARK_CONF to false. + */ + protected def checkRequiredConfigurations(): Unit = { + if (spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_REQUIRED_SPARK_CONFS_CHECK)) { + if (spark.conf.getOption( + SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION.key).isEmpty) { + throw DeltaErrors.configureSparkSessionWithExtensionAndCatalog(None) + } + } + } + + /** + * Returns a proper path canonicalization function for the current Delta log. + * + * If `runsOnExecutors` is true, the returned method will use a broadcast Hadoop Configuration + * so that the method is suitable for execution on executors. Otherwise, the returned method + * will use a local Hadoop Configuration and the method can only be executed on the driver. + */ + private[delta] def getCanonicalPathFunction(runsOnExecutors: Boolean): String => String = { + val hadoopConf = newDeltaHadoopConf() + // Wrap `hadoopConf` with a method to delay the evaluation to run on executors. + val getHadoopConf = if (runsOnExecutors) { + val broadcastHadoopConf = + spark.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) + () => broadcastHadoopConf.value.value + } else { + () => hadoopConf + } + + new DeltaLog.CanonicalPathFunction(getHadoopConf) + } + + /** + * Returns a proper path canonicalization UDF for the current Delta log. + * + * If `runsOnExecutors` is true, the returned UDF will use a broadcast Hadoop Configuration. 
+ * Otherwise, the returned UDF will use a local Hadoop Configuration and the UDF can + * only be executed on the driver. + */ + private[delta] def getCanonicalPathUdf(runsOnExecutors: Boolean = true): UserDefinedFunction = { + DeltaUDF.stringFromString(getCanonicalPathFunction(runsOnExecutors)) + } + + // --- modified start + override def fileFormat(protocol: Protocol, metadata: Metadata): FileFormat = { + if (ClickHouseConfig.isMergeTreeFormatEngine(metadata.configuration)) { + ClickHouseTableV2.getTable(this).getFileFormat(protocol, metadata) + } else { + super.fileFormat(protocol, metadata) + } + } + // --- modified end +} + +object DeltaLog extends DeltaLogging { + + // --- modified start + @SuppressWarnings(Array("io.github.zhztheplayer.scalawarts.InheritFromCaseClass")) + private class DeltaHadoopFsRelation( + location: FileIndex, + partitionSchema: StructType, + // The top-level columns in `dataSchema` should match the actual physical file schema, + // otherwise the ORC data source may not work with the by-ordinal mode. + dataSchema: StructType, + bucketSpec: Option[BucketSpec], + fileFormat: FileFormat, + options: Map[String, String] + )(spark: SparkSession, deltaLog: DeltaLog, catalogTableOpt: Option[CatalogTable]) + extends HadoopFsRelation( + location, + partitionSchema, + dataSchema, + bucketSpec, + fileFormat, + options)(spark) + with InsertableRelation { + def insert(data: DataFrame, overwrite: Boolean): Unit = { + val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append + WriteIntoDelta( + deltaLog = deltaLog, + mode = mode, + new DeltaOptions(Map.empty[String, String], spark.sessionState.conf), + partitionColumns = Seq.empty, + configuration = Map.empty, + data = data, + catalogTableOpt = catalogTableOpt).run(spark) + } + } + // --- modified end + + /** + * The key type of `DeltaLog` cache. It's a pair of the canonicalized table path and the file + * system options (options starting with "fs." or "dfs." prefix) passed into + * `DataFrameReader/Writer` + */ + private type DeltaLogCacheKey = (Path, Map[String, String]) + + /** The name of the subdirectory that holds Delta metadata files */ + private[delta] val LOG_DIR_NAME = "_delta_log" + + private[delta] def logPathFor(dataPath: String): Path = logPathFor(new Path(dataPath)) + private[delta] def logPathFor(dataPath: Path): Path = + DeltaTableUtils.safeConcatPaths(dataPath, LOG_DIR_NAME) + + /** + * We create only a single [[DeltaLog]] for any given `DeltaLogCacheKey` to avoid wasted work + * in reconstructing the log. + */ + type CacheKey = (Path, Map[String, String]) + private[delta] def getOrCreateCache(conf: SQLConf): + Cache[CacheKey, DeltaLog] = synchronized { + deltaLogCache match { + case Some(c) => c + case None => + val builder = createCacheBuilder(conf) + .removalListener( + (removalNotification: RemovalNotification[DeltaLogCacheKey, DeltaLog]) => { + val log = removalNotification.getValue + // TODO: We should use ref-counting to uncache snapshots instead of a manual timed op + try log.unsafeVolatileSnapshot.uncache() catch { + case _: java.lang.NullPointerException => + // Various layers will throw null pointer if the RDD is already gone. 
+ } + }) + deltaLogCache = Some(builder.build[CacheKey, DeltaLog]()) + deltaLogCache.get + } + } + + private var deltaLogCache: Option[Cache[CacheKey, DeltaLog]] = None + + /** + * Helper to create delta log caches + */ + private def createCacheBuilder(conf: SQLConf): CacheBuilder[AnyRef, AnyRef] = { + val cacheRetention = conf.getConf(DeltaSQLConf.DELTA_LOG_CACHE_RETENTION_MINUTES) + val cacheSize = conf + .getConf(DeltaSQLConf.DELTA_LOG_CACHE_SIZE) + .max(sys.props.get("delta.log.cacheSize").map(_.toLong).getOrElse(0L)) + + CacheBuilder + .newBuilder() + .expireAfterAccess(cacheRetention, TimeUnit.MINUTES) + .maximumSize(cacheSize) + } + + + /** + * Creates a [[LogicalRelation]] for a given [[DeltaLogFileIndex]], with all necessary file source + * options taken from the Delta Log. All reads of Delta metadata files should use this method. + */ + def indexToRelation( + spark: SparkSession, + index: DeltaLogFileIndex, + additionalOptions: Map[String, String], + schema: StructType = Action.logSchema): LogicalRelation = { + val formatSpecificOptions: Map[String, String] = index.format match { + case DeltaLogFileIndex.COMMIT_FILE_FORMAT => + jsonCommitParseOption + case _ => Map.empty + } + // Delta should NEVER ignore missing or corrupt metadata files, because doing so can render the + // entire table unusable. Hard-wire that into the file source options so the user can't override + // it by setting spark.sql.files.ignoreCorruptFiles or spark.sql.files.ignoreMissingFiles. + val allOptions = additionalOptions ++ formatSpecificOptions ++ Map( + FileSourceOptions.IGNORE_CORRUPT_FILES -> "false", + FileSourceOptions.IGNORE_MISSING_FILES -> "false" + ) + // --- modified start + // Don't need to add the bucketOption here, it handles the delta log meta json file + // --- modified end + val fsRelation = HadoopFsRelation( + index, index.partitionSchema, schema, None, index.format, allOptions)(spark) + LogicalRelation(fsRelation) + } + + // Don't tolerate malformed JSON when parsing Delta log actions (default is PERMISSIVE) + val jsonCommitParseOption = Map("mode" -> FailFastMode.name) + + /** Helper for creating a log when it is stored at the root of the data. */ + def forTable(spark: SparkSession, dataPath: String): DeltaLog = { + apply(spark, logPathFor(dataPath), Map.empty, new SystemClock) + } + + /** Helper for creating a log when it is stored at the root of the data. */ + def forTable(spark: SparkSession, dataPath: Path): DeltaLog = { + apply(spark, logPathFor(dataPath), new SystemClock) + } + + /** Helper for creating a log when it is stored at the root of the data. */ + def forTable(spark: SparkSession, dataPath: Path, options: Map[String, String]): DeltaLog = { + apply(spark, logPathFor(dataPath), options, new SystemClock) + } + + /** Helper for creating a log when it is stored at the root of the data. */ + def forTable(spark: SparkSession, dataPath: Path, clock: Clock): DeltaLog = { + apply(spark, logPathFor(dataPath), clock) + } + + /** Helper for creating a log for the table. */ + def forTable(spark: SparkSession, tableName: TableIdentifier): DeltaLog = { + forTable(spark, tableName, new SystemClock) + } + + /** Helper for creating a log for the table. */ + def forTable(spark: SparkSession, table: CatalogTable): DeltaLog = { + forTable(spark, table, new SystemClock) + } + + /** Helper for creating a log for the table.
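+ *
+ * For illustration only (the identifiers below are hypothetical, not part of this patch):
+ * {{{
+ *   // resolve via the session catalog
+ *   val byName = DeltaLog.forTable(spark, TableIdentifier("events", Some("db")))
+ *   // or resolve a path-based table directly
+ *   val byPath = DeltaLog.forTable(spark, new Path("/tmp/delta/events"))
+ * }}}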
*/ + def forTable(spark: SparkSession, tableName: TableIdentifier, clock: Clock): DeltaLog = { + if (DeltaTableIdentifier.isDeltaPath(spark, tableName)) { + forTable(spark, new Path(tableName.table)) + } else { + forTable(spark, spark.sessionState.catalog.getTableMetadata(tableName), clock) + } + } + + /** Helper for creating a log for the table. */ + def forTable(spark: SparkSession, table: CatalogTable, clock: Clock): DeltaLog = { + apply(spark, logPathFor(new Path(table.location)), clock) + } + + private def apply(spark: SparkSession, rawPath: Path, clock: Clock = new SystemClock): DeltaLog = + apply(spark, rawPath, Map.empty, clock) + + + /** Helper for getting a log, as well as the latest snapshot, of the table */ + def forTableWithSnapshot(spark: SparkSession, dataPath: String): (DeltaLog, Snapshot) = + withFreshSnapshot { forTable(spark, new Path(dataPath), _) } + + /** Helper for getting a log, as well as the latest snapshot, of the table */ + def forTableWithSnapshot(spark: SparkSession, dataPath: Path): (DeltaLog, Snapshot) = + withFreshSnapshot { forTable(spark, dataPath, _) } + + /** Helper for getting a log, as well as the latest snapshot, of the table */ + def forTableWithSnapshot( + spark: SparkSession, + tableName: TableIdentifier): (DeltaLog, Snapshot) = + withFreshSnapshot { forTable(spark, tableName, _) } + + /** Helper for getting a log, as well as the latest snapshot, of the table */ + def forTableWithSnapshot( + spark: SparkSession, + dataPath: Path, + options: Map[String, String]): (DeltaLog, Snapshot) = + withFreshSnapshot { apply(spark, logPathFor(dataPath), options, _) } + + /** + * Helper function to be used with the forTableWithSnapshot calls. Thunk is a + * partially applied DeltaLog.forTable call, which we can then wrap around with a + * snapshot update. We use the system clock to avoid back-to-back updates. + */ + private[delta] def withFreshSnapshot(thunk: Clock => DeltaLog): (DeltaLog, Snapshot) = { + val clock = new SystemClock + val ts = clock.getTimeMillis() + val deltaLog = thunk(clock) + val snapshot = deltaLog.update(checkIfUpdatedSinceTs = Some(ts)) + (deltaLog, snapshot) + } + + private def apply( + spark: SparkSession, + rawPath: Path, + options: Map[String, String], + clock: Clock + ): DeltaLog = { + val fileSystemOptions: Map[String, String] = + if (spark.sessionState.conf.getConf( + DeltaSQLConf.LOAD_FILE_SYSTEM_CONFIGS_FROM_DATAFRAME_OPTIONS)) { + // We pick up only file system options so that we don't pass any parquet or json options to + // the code that reads Delta transaction logs. 
+ options.filterKeys { k => + DeltaTableUtils.validDeltaTableHadoopPrefixes.exists(k.startsWith) + }.toMap + } else { + Map.empty + } + // scalastyle:off deltahadoopconfiguration + val hadoopConf = spark.sessionState.newHadoopConfWithOptions(fileSystemOptions) + // scalastyle:on deltahadoopconfiguration + val fs = rawPath.getFileSystem(hadoopConf) + val path = fs.makeQualified(rawPath) + def createDeltaLog(): DeltaLog = recordDeltaOperation( + null, + "delta.log.create", + Map(TAG_TAHOE_PATH -> path.getParent.toString)) { + AnalysisHelper.allowInvokingTransformsInAnalyzer { + new DeltaLog( + logPath = path, + dataPath = path.getParent, + options = fileSystemOptions, + allOptions = options, + clock = clock + ) + } + } + def getDeltaLogFromCache(): DeltaLog = { + // The following cases will still create a new ActionLog even if there is a cached + // ActionLog using a different format path: + // - Different `scheme` + // - Different `authority` (e.g., different user tokens in the path) + // - Different mount point. + try { + getOrCreateCache(spark.sessionState.conf) + .get(path -> fileSystemOptions, () => { + createDeltaLog() + } + ) + } catch { + case e: com.google.common.util.concurrent.UncheckedExecutionException => throw e.getCause + case e: java.util.concurrent.ExecutionException => throw e.getCause + } + } + + val deltaLog = getDeltaLogFromCache() + if (Option(deltaLog.sparkContext.get).map(_.isStopped).getOrElse(true)) { + // Invalid the cached `DeltaLog` and create a new one because the `SparkContext` of the cached + // `DeltaLog` has been stopped. + getOrCreateCache(spark.sessionState.conf).invalidate(path -> fileSystemOptions) + getDeltaLogFromCache() + } else { + deltaLog + } + } + + /** Invalidate the cached DeltaLog object for the given `dataPath`. */ + def invalidateCache(spark: SparkSession, dataPath: Path): Unit = { + try { + val rawPath = logPathFor(dataPath) + // scalastyle:off deltahadoopconfiguration + // This method cannot be called from DataFrameReader/Writer so it's safe to assume the user + // has set the correct file system configurations in the session configs. + val fs = rawPath.getFileSystem(spark.sessionState.newHadoopConf()) + // scalastyle:on deltahadoopconfiguration + val path = fs.makeQualified(rawPath) + + val deltaLogCache = getOrCreateCache(spark.sessionState.conf) + if (spark.sessionState.conf.getConf( + DeltaSQLConf.LOAD_FILE_SYSTEM_CONFIGS_FROM_DATAFRAME_OPTIONS)) { + // We rely on the fact that accessing the key set doesn't modify the entry access time. See + // `CacheBuilder.expireAfterAccess`. + val keysToBeRemoved = mutable.ArrayBuffer[DeltaLogCacheKey]() + val iter = deltaLogCache.asMap().keySet().iterator() + while (iter.hasNext) { + val key = iter.next() + if (key._1 == path) { + keysToBeRemoved += key + } + } + deltaLogCache.invalidateAll(keysToBeRemoved.asJava) + } else { + deltaLogCache.invalidate(path -> Map.empty) + } + } catch { + case NonFatal(e) => logWarning(e.getMessage, e) + } + } + + def clearCache(): Unit = { + deltaLogCache.foreach(_.invalidateAll()) + } + + /** Unset the caches. Exposing for testing */ + private[delta] def unsetCache(): Unit = { + synchronized { + deltaLogCache = None + } + } + + /** Return the number of cached `DeltaLog`s. Exposing for testing */ + private[delta] def cacheSize: Long = { + deltaLogCache.map(_.size()).getOrElse(0L) + } + + /** + * Filters the given [[Dataset]] by the given `partitionFilters`, returning those that match. 
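+ *
+ * A hedged usage sketch (assumes a `snapshot` of a table partitioned by a string column `date`;
+ * the names are illustrative only):
+ * {{{
+ *   import org.apache.spark.sql.functions.col
+ *   val matching = DeltaLog.filterFileList(
+ *     snapshot.metadata.partitionSchema,
+ *     snapshot.allFiles.toDF(),
+ *     Seq((col("date") === "2024-01-01").expr))
+ * }}}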
+ * @param files The active files in the DeltaLog state, which contains the partition value + * information + * @param partitionFilters Filters on the partition columns + * @param partitionColumnPrefixes The path to the `partitionValues` column, if it's nested + * @param shouldRewritePartitionFilters Whether to rewrite `partitionFilters` to be over the + * [[AddFile]] schema + */ + def filterFileList( + partitionSchema: StructType, + files: DataFrame, + partitionFilters: Seq[Expression], + partitionColumnPrefixes: Seq[String] = Nil, + shouldRewritePartitionFilters: Boolean = true): DataFrame = { + + val rewrittenFilters = if (shouldRewritePartitionFilters) { + rewritePartitionFilters( + partitionSchema, + files.sparkSession.sessionState.conf.resolver, + partitionFilters, + partitionColumnPrefixes) + } else { + partitionFilters + } + val expr = rewrittenFilters.reduceLeftOption(And).getOrElse(Literal.TrueLiteral) + val columnFilter = new Column(expr) + files.filter(columnFilter) + } + + /** + * Rewrite the given `partitionFilters` to be used for filtering partition values. + * We need to explicitly resolve the partitioning columns here because the partition columns + * are stored as keys of a Map type instead of attributes in the AddFile schema (below) and thus + * cannot be resolved automatically. + * + * @param partitionFilters Filters on the partition columns + * @param partitionColumnPrefixes The path to the `partitionValues` column, if it's nested + */ + def rewritePartitionFilters( + partitionSchema: StructType, + resolver: Resolver, + partitionFilters: Seq[Expression], + partitionColumnPrefixes: Seq[String] = Nil): Seq[Expression] = { + partitionFilters + .map(_.transformUp { + case a: Attribute => + // If we have a special column name, e.g. `a.a`, then an UnresolvedAttribute returns + // the column name as '`a.a`' instead of 'a.a', therefore we need to strip the backticks. + val unquoted = a.name.stripPrefix("`").stripSuffix("`") + val partitionCol = partitionSchema.find { field => resolver(field.name, unquoted) } + partitionCol match { + case Some(f: StructField) => + val name = DeltaColumnMapping.getPhysicalName(f) + Cast( + UnresolvedAttribute(partitionColumnPrefixes ++ Seq("partitionValues", name)), + f.dataType) + case None => + // This should not be able to happen, but the case was present in the original code so + // we kept it to be safe. + log.error(s"Partition filter referenced column ${a.name} not in the partition schema") + UnresolvedAttribute(partitionColumnPrefixes ++ Seq("partitionValues", a.name)) + } + }) + } + + + /** + * Checks whether this table only accepts appends. If so it will throw an error in operations that + * can remove data such as DELETE/UPDATE/MERGE. + */ + def assertRemovable(snapshot: Snapshot): Unit = { + val metadata = snapshot.metadata + if (DeltaConfigs.IS_APPEND_ONLY.fromMetaData(metadata)) { + throw DeltaErrors.modifyAppendOnlyTableException(metadata.name) + } + } + + /** How long to keep around SetTransaction actions before physically deleting them. */ + def minSetTransactionRetentionInterval(metadata: Metadata): Option[Long] = { + DeltaConfigs.TRANSACTION_ID_RETENTION_DURATION + .fromMetaData(metadata) + .map(DeltaConfigs.getMilliSeconds) + } + /** How long to keep around logically deleted files before physically deleting them. */ + def tombstoneRetentionMillis(metadata: Metadata): Long = { + DeltaConfigs.getMilliSeconds(DeltaConfigs.TOMBSTONE_RETENTION.fromMetaData(metadata)) + } + + /** Get a function that canonicalizes a given `path`. 
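+ * For illustration (the actual result depends on the Hadoop configuration in use): an absolute
+ * path with no scheme, e.g. `/data/t1/part-0.parquet`, is qualified against the default file
+ * system, while a relative path or an already fully qualified URI is returned unchanged.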
*/ + private[delta] class CanonicalPathFunction(getHadoopConf: () => Configuration) + extends Function[String, String] with Serializable { + // Mark it `@transient lazy val` so that de-serialization happens only once on every executor. + @transient + private lazy val fs = { + // scalastyle:off FileSystemGet + FileSystem.get(getHadoopConf()) + // scalastyle:on FileSystemGet + } + + override def apply(path: String): String = { + // scalastyle:off pathfromuri + val hadoopPath = new Path(new URI(path)) + // scalastyle:on pathfromuri + if (hadoopPath.isAbsoluteAndSchemeAuthorityNull) { + fs.makeQualified(hadoopPath).toUri.toString + } else { + // return untouched if it is a relative path or is already fully qualified + hadoopPath.toUri.toString + } + } + } +} diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/Snapshot.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/Snapshot.scala new file mode 100644 index 0000000000000..8836f7c88d237 --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/Snapshot.scala @@ -0,0 +1,683 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.delta + +// scalastyle:off import.ordering.noEmptyLine +import scala.collection.mutable + +import org.apache.spark.sql.delta.actions._ +import org.apache.spark.sql.delta.actions.Action.logSchema +import org.apache.spark.sql.delta.managedcommit.{CommitOwnerProvider, TableCommitOwnerClient} +import org.apache.spark.sql.delta.metering.DeltaLogging +import org.apache.spark.sql.delta.schema.SchemaUtils +import org.apache.spark.sql.delta.sources.DeltaSQLConf +import org.apache.spark.sql.delta.stats.DataSkippingReader +import org.apache.spark.sql.delta.stats.DeltaScan +import org.apache.spark.sql.delta.stats.DeltaStatsColumnSpec +import org.apache.spark.sql.delta.stats.StatisticsCollection +import org.apache.spark.sql.delta.util.DeltaCommitFileProvider +import org.apache.spark.sql.delta.util.FileNames +import org.apache.spark.sql.delta.util.StateCache +import org.apache.hadoop.fs.{FileStatus, Path} + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.Utils + +/** + * Gluten overwrite Delta: + * + * This file is copied from Delta 3.2.0. It is modified to overcome the following issues: + * 1. filesForScan() will cache the DeltaScan by the FilterExprsAsKey + * 2. 
filesForScan() should return DeltaScan of AddMergeTreeParts instead of AddFile + */ + +/** + * A description of a Delta [[Snapshot]], including basic information such as its [[DeltaLog]] + * metadata, protocol, and version. + */ +trait SnapshotDescriptor { + def deltaLog: DeltaLog + def version: Long + def metadata: Metadata + def protocol: Protocol + + def schema: StructType = metadata.schema + + protected[delta] def numOfFilesIfKnown: Option[Long] + protected[delta] def sizeInBytesIfKnown: Option[Long] +} + +/** + * An immutable snapshot of the state of the log at some delta version. Internally + * this class manages the replay of actions stored in checkpoint or delta files. + * + * After resolving any new actions, it caches the result and collects the + * following basic information to the driver: + * - Protocol Version + * - Metadata + * - Transaction state + * + * @param inCommitTimestampOpt The in-commit-timestamp of the latest commit in milliseconds. Can + * be set to None if + * 1. The timestamp has not been read yet - generally the case for cold tables. + * 2. Or the table has not been initialized, i.e. `version = -1`. + * 3. Or the table does not have [[InCommitTimestampTableFeature]] enabled. + * + */ +class Snapshot( + val path: Path, + override val version: Long, + val logSegment: LogSegment, + override val deltaLog: DeltaLog, + val checksumOpt: Option[VersionChecksum] + ) + extends SnapshotDescriptor + with SnapshotStateManager + with StateCache + with StatisticsCollection + with DataSkippingReader + with DeltaLogging { + + import Snapshot._ + import DeltaLogFileIndex.COMMIT_VERSION_COLUMN + // For implicits which re-use Encoder: + import org.apache.spark.sql.delta.implicits._ + + protected def spark = SparkSession.active + + /** Snapshot to scan by the DeltaScanGenerator for metadata query optimizations */ + override val snapshotToScan: Snapshot = this + + override def columnMappingMode: DeltaColumnMappingMode = metadata.columnMappingMode + + /** + * Returns the timestamp of the latest commit of this snapshot. + * For an uninitialized snapshot, this returns -1. + * + * When InCommitTimestampTableFeature is enabled, the timestamp + * is retrieved from the CommitInfo of the latest commit, which + * can result in an IO operation. + */ + def timestamp: Long = + getInCommitTimestampOpt.getOrElse(logSegment.lastCommitFileModificationTimestamp) + + /** + * Returns the inCommitTimestamp if ICT is enabled, otherwise returns None. + * This potentially triggers an IO operation to read the inCommitTimestamp. + * This is a lazy val, so repeated calls will not trigger multiple IO operations. + */ + protected lazy val getInCommitTimestampOpt: Option[Long] = { + // --- modified start + // This implicit is for Scala 2.12; copied from Scala 2.13 + implicit class OptionExtCompanion(opt: Option.type) { + /** + * When a given condition is true, evaluates the `a` argument and returns Some(a). + * When the condition is false, `a` is not evaluated and None is returned. + */ + def when[A](cond: Boolean)(a: => A): Option[A] = if (cond) Some(a) else None + + /** + * When a given condition is false, evaluates the `a` argument and returns Some(a). + * When the condition is true, `a` is not evaluated and None is returned. + */ + def whenNot[A](cond: Boolean)(a: => A): Option[A] = if (!cond) Some(a) else None + + /** Sum up all the `options`, substituting `default` for each `None`.
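+ * For example, `Option.sum(0)(Some(1), None, Some(2))` returns `3`.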
*/ + def sum[N: Numeric](default: N)(options: Option[N]*): N = + options.map(_.getOrElse(default)).sum + } + // --- modified end + Option.when(DeltaConfigs.IN_COMMIT_TIMESTAMPS_ENABLED.fromMetaData(metadata)) { + _reconstructedProtocolMetadataAndICT.inCommitTimestamp + .getOrElse { + val startTime = System.currentTimeMillis() + var exception = Option.empty[Throwable] + try { + val commitInfoOpt = DeltaHistoryManager.getCommitInfoOpt( + deltaLog.store, + DeltaCommitFileProvider(this).deltaFile(version), + deltaLog.newDeltaHadoopConf()) + CommitInfo.getRequiredInCommitTimestamp(commitInfoOpt, version.toString) + } catch { + case e: Throwable => + exception = Some(e) + throw e + } finally { + recordDeltaEvent( + deltaLog, + "delta.inCommitTimestamp.read", + data = Map( + "version" -> version, + "callSite" -> "Snapshot.getInCommitTimestampOpt", + "checkpointVersion" -> logSegment.checkpointProvider.version, + "durationMs" -> (System.currentTimeMillis() - startTime), + "exceptionMessage" -> exception.map(_.getMessage).getOrElse(""), + "exceptionStackTrace" -> exception.map(_.getStackTrace.mkString("\n")).getOrElse("") + ) + ) + } + } + } + } + + + private[delta] lazy val nonFileActions: Seq[Action] = { + Seq(protocol, metadata) ++ + setTransactions ++ + domainMetadata + } + + @volatile private[delta] var stateReconstructionTriggered = false + + /** + * Use [[stateReconstruction]] to create a representation of the actions in this table. + * Cache the resultant output. + */ + private lazy val cachedState = recordFrameProfile("Delta", "snapshot.cachedState") { + stateReconstructionTriggered = true + cacheDS(stateReconstruction, s"Delta Table State #$version - $redactedPath") + } + + /** + * Given the list of files from `LogSegment`, create respective file indices to help create + * a DataFrame and short-circuit the many file existence and partition schema inference checks + * that exist in DataSource.resolveRelation(). + */ + protected[delta] lazy val deltaFileIndexOpt: Option[DeltaLogFileIndex] = { + assertLogFilesBelongToTable(path, logSegment.deltas) + DeltaLogFileIndex(DeltaLogFileIndex.COMMIT_FILE_FORMAT, logSegment.deltas) + } + + protected lazy val fileIndices: Seq[DeltaLogFileIndex] = { + val checkpointFileIndexes = checkpointProvider.allActionsFileIndexes() + checkpointFileIndexes ++ deltaFileIndexOpt.toSeq + } + + /** + * Protocol, Metadata, and In-Commit Timestamp retrieved through + * `protocolMetadataAndICTReconstruction` which skips a full state reconstruction. + */ + case class ReconstructedProtocolMetadataAndICT( + protocol: Protocol, + metadata: Metadata, + inCommitTimestamp: Option[Long]) + + /** + * Generate the protocol and metadata for this snapshot. This is usually cheaper than a + * full state reconstruction, but still only compute it when necessary. + */ + private lazy val _reconstructedProtocolMetadataAndICT: ReconstructedProtocolMetadataAndICT = + { + // Should be small. 
At most 'checkpointInterval' rows, unless new commits are coming + // in before a checkpoint can be written + var protocol: Protocol = null + var metadata: Metadata = null + var inCommitTimestamp: Option[Long] = None + protocolMetadataAndICTReconstruction().foreach { + case ReconstructedProtocolMetadataAndICT(p: Protocol, _, _) => protocol = p + case ReconstructedProtocolMetadataAndICT(_, m: Metadata, _) => metadata = m + case ReconstructedProtocolMetadataAndICT(_, _, ict: Option[Long]) => inCommitTimestamp = ict + } + + if (protocol == null) { + recordDeltaEvent( + deltaLog, + opType = "delta.assertions.missingAction", + data = Map( + "version" -> version.toString, "action" -> "Protocol", "source" -> "Snapshot")) + throw DeltaErrors.actionNotFoundException("protocol", version) + } + + if (metadata == null) { + recordDeltaEvent( + deltaLog, + opType = "delta.assertions.missingAction", + data = Map( + "version" -> version.toString, "action" -> "Metadata", "source" -> "Snapshot")) + throw DeltaErrors.actionNotFoundException("metadata", version) + } + + ReconstructedProtocolMetadataAndICT(protocol, metadata, inCommitTimestamp) + } + + /** + * [[CommitOwnerClient]] for the given delta table as of this snapshot. + * - This must be present when managed commit is enabled. + * - This must be None when managed commit is disabled. + */ + val tableCommitOwnerClientOpt: Option[TableCommitOwnerClient] = initializeTableCommitOwner() + protected def initializeTableCommitOwner(): Option[TableCommitOwnerClient] = { + CommitOwnerProvider.getTableCommitOwner(this) + } + + /** Number of columns to collect stats on for data skipping */ + override lazy val statsColumnSpec: DeltaStatsColumnSpec = + StatisticsCollection.configuredDeltaStatsColumnSpec(metadata) + + /** Performs validations during initialization */ + protected def init(): Unit = { + deltaLog.protocolRead(protocol) + deltaLog.assertTableFeaturesMatchMetadata(protocol, metadata) + SchemaUtils.recordUndefinedTypes(deltaLog, metadata.schema) + } + + /** The current set of actions in this [[Snapshot]] as plain Rows */ + def stateDF: DataFrame = recordFrameProfile("Delta", "stateDF") { + cachedState.getDF + } + + /** The current set of actions in this [[Snapshot]] as a typed Dataset. */ + def stateDS: Dataset[SingleAction] = recordFrameProfile("Delta", "stateDS") { + cachedState.getDS + } + + private[delta] def allFilesViaStateReconstruction: Dataset[AddFile] = { + stateDS.where("add IS NOT NULL").select(col("add").as[AddFile]) + } + + // Here we need to bypass the ACL checks for SELECT anonymous function permissions. + /** All of the files present in this [[Snapshot]]. */ + def allFiles: Dataset[AddFile] = allFilesViaStateReconstruction + + /** All unexpired tombstones. */ + def tombstones: Dataset[RemoveFile] = { + stateDS.where("remove IS NOT NULL").select(col("remove").as[RemoveFile]) + } + + def deltaFileSizeInBytes(): Long = deltaFileIndexOpt.map(_.sizeInBytes).getOrElse(0L) + + def checkpointSizeInBytes(): Long = checkpointProvider.effectiveCheckpointSizeInBytes() + + override def metadata: Metadata = _reconstructedProtocolMetadataAndICT.metadata + + override def protocol: Protocol = _reconstructedProtocolMetadataAndICT.protocol + + /** + * Pulls the protocol and metadata of the table from the files that are used to compute the + * Snapshot directly--without triggering a full state reconstruction. This is important, because + * state reconstruction depends on protocol and metadata for correctness. 
+ * If the current table version does not have a checkpoint, this function will also return the + * in-commit-timestamp of the latest commit if available. + * + * Also this method should only access methods defined in [[UninitializedCheckpointProvider]] + * which are not present in [[CheckpointProvider]]. This is because initialization of + * [[Snapshot.checkpointProvider]] depends on [[Snapshot.protocolMetadataAndICTReconstruction()]] + * and so if [[Snapshot.protocolMetadataAndICTReconstruction()]] starts depending on + * [[Snapshot.checkpointProvider]] then there will be cyclic dependency. + */ + protected def protocolMetadataAndICTReconstruction(): + Array[ReconstructedProtocolMetadataAndICT] = { + import implicits._ + + val schemaToUse = Action.logSchema(Set("protocol", "metaData", "commitInfo")) + val checkpointOpt = checkpointProvider.topLevelFileIndex.map { index => + deltaLog.loadIndex(index, schemaToUse) + .withColumn(COMMIT_VERSION_COLUMN, lit(checkpointProvider.version)) + } + (checkpointOpt ++ deltaFileIndexOpt.map(deltaLog.loadIndex(_, schemaToUse)).toSeq) + .reduceOption(_.union(_)).getOrElse(emptyDF) + .select("protocol", "metaData", "commitInfo.inCommitTimestamp", COMMIT_VERSION_COLUMN) + .where("protocol.minReaderVersion is not null or metaData.id is not null " + + s"or (commitInfo.inCommitTimestamp is not null and version = $version)") + .as[(Protocol, Metadata, Option[Long], Long)] + .collect() + .sortBy(_._4) + .map { + case (p, m, ict, _) => ReconstructedProtocolMetadataAndICT(p, m, ict) + } + } + + // Reconstruct the state by applying deltas in order to the checkpoint. + // We partition by path as it is likely the bulk of the data is add/remove. + // Non-path based actions will be collocated to a single partition. + protected def stateReconstruction: Dataset[SingleAction] = { + recordFrameProfile("Delta", "snapshot.stateReconstruction") { + // for serializability + val localMinFileRetentionTimestamp = minFileRetentionTimestamp + val localMinSetTransactionRetentionTimestamp = minSetTransactionRetentionTimestamp + + val canonicalPath = deltaLog.getCanonicalPathUdf() + + // Canonicalize the paths so we can repartition the actions correctly, but only rewrite the + // add/remove actions themselves after partitioning and sorting are complete. Otherwise, the + // optimizer can generate a really bad plan that re-evaluates _EVERY_ field of the rewritten + // struct(...) projection every time we touch _ANY_ field of the rewritten struct. + // + // NOTE: We sort by [[COMMIT_VERSION_COLUMN]] (provided by [[loadActions]]), to ensure that + // actions are presented to InMemoryLogReplay in the ascending version order it expects. 
+ val ADD_PATH_CANONICAL_COL_NAME = "add_path_canonical" + val REMOVE_PATH_CANONICAL_COL_NAME = "remove_path_canonical" + loadActions + .withColumn(ADD_PATH_CANONICAL_COL_NAME, when( + col("add.path").isNotNull, canonicalPath(col("add.path")))) + .withColumn(REMOVE_PATH_CANONICAL_COL_NAME, when( + col("remove.path").isNotNull, canonicalPath(col("remove.path")))) + .repartition( + getNumPartitions, + coalesce(col(ADD_PATH_CANONICAL_COL_NAME), col(REMOVE_PATH_CANONICAL_COL_NAME))) + .sortWithinPartitions(COMMIT_VERSION_COLUMN) + .withColumn("add", when( + col("add.path").isNotNull, + struct( + col(ADD_PATH_CANONICAL_COL_NAME).as("path"), + col("add.partitionValues"), + col("add.size"), + col("add.modificationTime"), + col("add.dataChange"), + col(ADD_STATS_TO_USE_COL_NAME).as("stats"), + col("add.tags"), + col("add.deletionVector"), + col("add.baseRowId"), + col("add.defaultRowCommitVersion"), + col("add.clusteringProvider") + ))) + .withColumn("remove", when( + col("remove.path").isNotNull, + col("remove").withField("path", col(REMOVE_PATH_CANONICAL_COL_NAME)))) + .as[SingleAction] + .mapPartitions { iter => + val state: LogReplay = + new InMemoryLogReplay( + localMinFileRetentionTimestamp, + localMinSetTransactionRetentionTimestamp) + state.append(0, iter.map(_.unwrap)) + state.checkpoint.map(_.wrap) + } + } + } + + /** + * Loads the file indices into a DataFrame that can be used for LogReplay. + * + * In addition to the usual nested columns provided by the SingleAction schema, it should provide + * two additional columns to simplify the log replay process: [[COMMIT_VERSION_COLUMN]] (which, + * when sorted in ascending order, will order older actions before newer ones, as required by + * [[InMemoryLogReplay]]); and [[ADD_STATS_TO_USE_COL_NAME]] (to handle certain combinations of + * config settings for delta.checkpoint.writeStatsAsJson and delta.checkpoint.writeStatsAsStruct). + */ + protected def loadActions: DataFrame = { + fileIndices.map(deltaLog.loadIndex(_)) + .reduceOption(_.union(_)).getOrElse(emptyDF) + .withColumn(ADD_STATS_TO_USE_COL_NAME, col("add.stats")) + } + + /** + * Tombstones before the [[minFileRetentionTimestamp]] timestamp will be dropped from the + * checkpoint. + */ + private[delta] def minFileRetentionTimestamp: Long = { + deltaLog.clock.getTimeMillis() - DeltaLog.tombstoneRetentionMillis(metadata) + } + + /** + * [[SetTransaction]]s before [[minSetTransactionRetentionTimestamp]] will be considered expired + * and dropped from the snapshot. + */ + private[delta] def minSetTransactionRetentionTimestamp: Option[Long] = { + DeltaLog.minSetTransactionRetentionInterval(metadata).map(deltaLog.clock.getTimeMillis() - _) + } + + private[delta] def getNumPartitions: Int = { + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_SNAPSHOT_PARTITIONS) + .getOrElse(Snapshot.defaultNumSnapshotPartitions) + } + + /** + * Computes all the information that is needed by the checksum for the current snapshot. + * May kick off state reconstruction if needed by any of the underlying fields. + * Note that it's safe to set txnId to none, since the snapshot doesn't always have a txn + * attached. E.g. if a snapshot is created by reading a checkpoint, then no txnId is present. 
+ */ + def computeChecksum: VersionChecksum = VersionChecksum( + txnId = None, + tableSizeBytes = sizeInBytes, + numFiles = numOfFiles, + numMetadata = numOfMetadata, + numProtocol = numOfProtocol, + inCommitTimestampOpt = getInCommitTimestampOpt, + setTransactions = checksumOpt.flatMap(_.setTransactions), + domainMetadata = domainMetadatasIfKnown, + metadata = metadata, + protocol = protocol, + histogramOpt = fileSizeHistogram, + allFiles = checksumOpt.flatMap(_.allFiles)) + + /** Returns the data schema of the table, used for reading stats */ + def tableSchema: StructType = metadata.dataSchema + + def outputTableStatsSchema: StructType = metadata.dataSchema + + def outputAttributeSchema: StructType = metadata.dataSchema + + /** Returns the schema of the columns written out to file (overridden in write path) */ + def dataSchema: StructType = metadata.dataSchema + + /** Return the set of properties of the table. */ + def getProperties: mutable.Map[String, String] = { + val base = new mutable.LinkedHashMap[String, String]() + metadata.configuration.foreach { case (k, v) => + if (k != "path") { + base.put(k, v) + } + } + base.put(Protocol.MIN_READER_VERSION_PROP, protocol.minReaderVersion.toString) + base.put(Protocol.MIN_WRITER_VERSION_PROP, protocol.minWriterVersion.toString) + if (protocol.supportsReaderFeatures || protocol.supportsWriterFeatures) { + val features = protocol.readerAndWriterFeatureNames.map(name => + s"${TableFeatureProtocolUtils.FEATURE_PROP_PREFIX}$name" -> + TableFeatureProtocolUtils.FEATURE_PROP_SUPPORTED) + base ++ features.toSeq.sorted + } else { + base + } + } + + /** The [[CheckpointProvider]] for the underlying checkpoint */ + lazy val checkpointProvider: CheckpointProvider = logSegment.checkpointProvider match { + case cp: CheckpointProvider => cp + case uninitializedProvider: UninitializedCheckpointProvider => + CheckpointProvider(spark, this, checksumOpt, uninitializedProvider) + case o => throw new IllegalStateException(s"Unknown checkpoint provider: ${o.getClass.getName}") + } + + def redactedPath: String = + Utils.redact(spark.sessionState.conf.stringRedactionPattern, path.toUri.toString) + + /** + * Ensures that commit files are backfilled up to the current version in the snapshot. + * + * This method checks if there are any un-backfilled versions up to the current version and + * triggers the backfilling process using the commit-owner. It verifies that the delta file for + * the current version exists after the backfilling process. + * + * @throws IllegalStateException + * if the delta file for the current version is not found after backfilling. + */ + def ensureCommitFilesBackfilled(): Unit = { + val tableCommitOwnerClient = tableCommitOwnerClientOpt.getOrElse { + return + } + val minUnbackfilledVersion = DeltaCommitFileProvider(this).minUnbackfilledVersion + if (minUnbackfilledVersion <= version) { + val hadoopConf = deltaLog.newDeltaHadoopConf() + tableCommitOwnerClient.backfillToVersion( + startVersion = minUnbackfilledVersion, endVersion = Some(version)) + val fs = deltaLog.logPath.getFileSystem(hadoopConf) + val expectedBackfilledDeltaFile = FileNames.unsafeDeltaFile(deltaLog.logPath, version) + if (!fs.exists(expectedBackfilledDeltaFile)) { + throw new IllegalStateException("Backfilling of commit files failed. 
" + + s"Expected delta file $expectedBackfilledDeltaFile not found.") + } + } + } + + + protected def emptyDF: DataFrame = + spark.createDataFrame(spark.sparkContext.emptyRDD[Row], logSchema) + + + override def logInfo(msg: => String): Unit = { + super.logInfo(s"[tableId=${deltaLog.tableId}] " + msg) + } + + override def logWarning(msg: => String): Unit = { + super.logWarning(s"[tableId=${deltaLog.tableId}] " + msg) + } + + override def logWarning(msg: => String, throwable: Throwable): Unit = { + super.logWarning(s"[tableId=${deltaLog.tableId}] " + msg, throwable) + } + + override def logError(msg: => String): Unit = { + super.logError(s"[tableId=${deltaLog.tableId}] " + msg) + } + + override def logError(msg: => String, throwable: Throwable): Unit = { + super.logError(s"[tableId=${deltaLog.tableId}] " + msg, throwable) + } + + override def toString: String = + s"${getClass.getSimpleName}(path=$path, version=$version, metadata=$metadata, " + + s"logSegment=$logSegment, checksumOpt=$checksumOpt)" + + // --- modified start + override def filesForScan(filters: Seq[Expression], keepNumRecords: Boolean): DeltaScan = { + val deltaScan = ClickhouseSnapshot.deltaScanCache.get( + FilterExprsAsKey(path, ClickhouseSnapshot.genSnapshotId(this), filters, None), + () => { + super.filesForScan(filters, keepNumRecords) + }) + + replaceWithAddMergeTreeParts(deltaScan) + } + + override def filesForScan(limit: Long, partitionFilters: Seq[Expression]): DeltaScan = { + val deltaScan = ClickhouseSnapshot.deltaScanCache.get( + FilterExprsAsKey(path, ClickhouseSnapshot.genSnapshotId(this), partitionFilters, Some(limit)), + () => { + super.filesForScan(limit, partitionFilters) + }) + + replaceWithAddMergeTreeParts(deltaScan) + } + + private def replaceWithAddMergeTreeParts(deltaScan: DeltaScan) = { + if (ClickHouseConfig.isMergeTreeFormatEngine(metadata.configuration)) { + DeltaScan.apply( + deltaScan.version, + deltaScan.files + .map( + addFile => { + val addFileAsKey = AddFileAsKey(addFile) + + val ret = ClickhouseSnapshot.addFileToAddMTPCache.get(addFileAsKey) + // this is for later use + ClickhouseSnapshot.pathToAddMTPCache.put(ret.fullPartPath(), ret) + ret + }), + deltaScan.total, + deltaScan.partition, + deltaScan.scanned + )( + deltaScan.scannedSnapshot, + deltaScan.partitionFilters, + deltaScan.dataFilters, + deltaScan.unusedFilters, + deltaScan.scanDurationMs, + deltaScan.dataSkippingType + ) + } else { + deltaScan + } + } + // --- modified end + + logInfo(s"Created snapshot $this") + init() +} + +object Snapshot extends DeltaLogging { + + // Used by [[loadActions]] and [[stateReconstruction]] + val ADD_STATS_TO_USE_COL_NAME = "add_stats_to_use" + + private val defaultNumSnapshotPartitions: Int = 50 + + /** Verifies that a set of delta or checkpoint files to be read actually belongs to this table. */ + private def assertLogFilesBelongToTable(logBasePath: Path, files: Seq[FileStatus]): Unit = { + val logPath = new Path(logBasePath.toUri) + val commitDirPath = FileNames.commitDirPath(logPath) + files.map(_.getPath).foreach { filePath => + val commitParent = new Path(filePath.toUri).getParent + if (commitParent != logPath && commitParent != commitDirPath) { + // scalastyle:off throwerror + throw new AssertionError(s"File ($filePath) doesn't belong in the " + + s"transaction log at $logBasePath.") + // scalastyle:on throwerror + } + } + } +} + +/** + * An initial snapshot with only metadata specified. Useful for creating a DataFrame from an + * existing parquet table during its conversion to delta. 
+ * + * @param logPath the path to transaction log + * @param deltaLog the delta log object + * @param metadata the metadata of the table + */ +class InitialSnapshot( + val logPath: Path, + override val deltaLog: DeltaLog, + override val metadata: Metadata) + extends Snapshot( + path = logPath, + version = -1, + logSegment = LogSegment.empty(logPath), + deltaLog = deltaLog, + checksumOpt = None + ) { + + def this(logPath: Path, deltaLog: DeltaLog) = this( + logPath, + deltaLog, + Metadata( + configuration = DeltaConfigs.mergeGlobalConfigs( + sqlConfs = SparkSession.active.sessionState.conf, + tableConf = Map.empty, + ignoreProtocolConfsOpt = Some( + DeltaConfigs.ignoreProtocolDefaultsIsSet( + sqlConfs = SparkSession.active.sessionState.conf, + tableConf = deltaLog.allOptions))), + createdTime = Some(System.currentTimeMillis()))) + + override def stateDS: Dataset[SingleAction] = emptyDF.as[SingleAction] + override def stateDF: DataFrame = emptyDF + override protected lazy val computedState: SnapshotState = initialState(metadata) + override def protocol: Protocol = computedState.protocol + override protected lazy val getInCommitTimestampOpt: Option[Long] = None + + // The [[InitialSnapshot]] is not backed by any external commit-owner. + override def initializeTableCommitOwner(): Option[TableCommitOwnerClient] = None + override def timestamp: Long = -1L +} diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/catalog/ClickHouseTableV2.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/catalog/ClickHouseTableV2.scala new file mode 100644 index 0000000000000..8b4a13a30a699 --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/catalog/ClickHouseTableV2.scala @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.delta.catalog + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.{ResolvedTable, UnresolvedTable} +import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.catalog.V1Table +import org.apache.spark.sql.connector.read.InputPartition +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} +import org.apache.spark.sql.delta.{ClickhouseSnapshot, DeltaErrors, DeltaLog, DeltaTableUtils, DeltaTimeTravelSpec, Snapshot, UnresolvedPathBasedDeltaTable} +import org.apache.spark.sql.delta.actions.{Metadata, Protocol} +import org.apache.spark.sql.delta.sources.DeltaDataSource +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, PartitionDirectory} +import org.apache.spark.sql.execution.datasources.utils.MergeTreePartsPartitionsUtil +import org.apache.spark.sql.execution.datasources.v2.clickhouse.source.DeltaMergeTreeFileFormat +import org.apache.spark.sql.execution.datasources.v2.clickhouse.utils.CHDataSourceUtils +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.util.collection.BitSet + +import org.apache.hadoop.fs.Path + +import java.{util => ju} + +import scala.collection.JavaConverters._ + +@SuppressWarnings(Array("io.github.zhztheplayer.scalawarts.InheritFromCaseClass")) +class ClickHouseTableV2( + override val spark: SparkSession, + override val path: Path, + override val catalogTable: Option[CatalogTable] = None, + override val tableIdentifier: Option[String] = None, + override val timeTravelOpt: Option[DeltaTimeTravelSpec] = None, + override val options: Map[String, String] = Map.empty, + val clickhouseExtensionOptions: Map[String, String] = Map.empty) + extends DeltaTableV2(spark, path, catalogTable, tableIdentifier, timeTravelOpt, options) + with ClickHouseTableV2Base { + + lazy val (rootPath, partitionFilters, timeTravelByPath) = { + if (catalogTable.isDefined) { + // Fast path for reducing path munging overhead + (new Path(catalogTable.get.location), Nil, None) + } else { + DeltaDataSource.parsePathIdentifier(spark, path.toString, options) + } + } + + override def name(): String = + catalogTable + .map(_.identifier.unquotedString) + .orElse(tableIdentifier) + .getOrElse(s"clickhouse.`${deltaLog.dataPath}`") + + private lazy val timeTravelSpec: Option[DeltaTimeTravelSpec] = { + if (timeTravelOpt.isDefined && timeTravelByPath.isDefined) { + throw DeltaErrors.multipleTimeTravelSyntaxUsed + } + timeTravelOpt.orElse(timeTravelByPath) + } + + private lazy val caseInsensitiveOptions = new CaseInsensitiveStringMap(options.asJava) + + override def properties(): ju.Map[String, String] = { + val ret = super.properties() + + // for file path based write + if (initialSnapshot.version < 0 && clickhouseExtensionOptions.nonEmpty) { + ret.putAll(clickhouseExtensionOptions.asJava) + } + ret + } + + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { + new WriteIntoDeltaBuilder( + this, + info.options, + spark.sessionState.conf.useNullsForMissingDefaultColumnValues) + } + + def getFileFormat(protocol: Protocol, meta: Metadata): DeltaMergeTreeFileFormat = { + new DeltaMergeTreeFileFormat( + protocol, + meta, + dataBaseName, + tableName, + 
ClickhouseSnapshot.genSnapshotId(initialSnapshot), + orderByKeyOption, + lowCardKeyOption, + minmaxIndexKeyOption, + bfIndexKeyOption, + setIndexKeyOption, + primaryKeyOption, + clickhouseTableConfigs, + partitionColumns + ) + } + + override def deltaProperties(): ju.Map[String, String] = properties() + + override def deltaCatalog(): Option[CatalogTable] = catalogTable + + override def deltaPath(): Path = path + + override def deltaSnapshot(): Snapshot = initialSnapshot + + def cacheThis(): Unit = { + ClickHouseTableV2.deltaLog2Table.put(deltaLog, this) + } + + cacheThis() +} + +@SuppressWarnings(Array("io.github.zhztheplayer.scalawarts.InheritFromCaseClass")) +class TempClickHouseTableV2( + override val spark: SparkSession, + override val catalogTable: Option[CatalogTable] = None) + extends ClickHouseTableV2(spark, null, catalogTable) { + import collection.JavaConverters._ + override def properties(): ju.Map[String, String] = catalogTable.get.properties.asJava + override lazy val partitionColumns: Seq[String] = catalogTable.get.partitionColumnNames + override def cacheThis(): Unit = {} +} + +object ClickHouseTableV2 extends Logging { + private val deltaLog2Table = + new scala.collection.concurrent.TrieMap[DeltaLog, ClickHouseTableV2]() + // for CTAS use + val temporalThreadLocalCHTable = new ThreadLocal[ClickHouseTableV2]() + + def getTable(deltaLog: DeltaLog): ClickHouseTableV2 = { + if (deltaLog2Table.contains(deltaLog)) { + deltaLog2Table(deltaLog) + } else if (temporalThreadLocalCHTable.get() != null) { + temporalThreadLocalCHTable.get() + } else { + throw new IllegalStateException( + s"Can not find ClickHouseTableV2 for deltalog ${deltaLog.dataPath}") + } + } + + def clearCache(): Unit = { + deltaLog2Table.clear() + temporalThreadLocalCHTable.remove() + } + + def partsPartitions( + deltaLog: DeltaLog, + relation: HadoopFsRelation, + selectedPartitions: Array[PartitionDirectory], + output: Seq[Attribute], + bucketedScan: Boolean, + optionalBucketSet: Option[BitSet], + optionalNumCoalescedBuckets: Option[Int], + disableBucketedScan: Boolean, + filterExprs: Seq[Expression]): Seq[InputPartition] = { + val tableV2 = ClickHouseTableV2.getTable(deltaLog) + + MergeTreePartsPartitionsUtil.getMergeTreePartsPartitions( + relation, + selectedPartitions, + output, + bucketedScan, + tableV2.spark, + tableV2, + optionalBucketSet, + optionalNumCoalescedBuckets, + disableBucketedScan, + filterExprs) + } + + /** Resolves a path into a DeltaTableV2, leveraging standard v2 table resolution. */ + def apply( + spark: SparkSession, + tablePath: Path, + options: Map[String, String], + cmd: String): DeltaTableV2 = + resolve(spark, UnresolvedPathBasedDeltaTable(tablePath.toString, options, cmd), cmd) + + /** Resolves a table identifier into a DeltaTableV2, leveraging standard v2 table resolution. */ + def apply(spark: SparkSession, tableId: TableIdentifier, cmd: String): DeltaTableV2 = { + resolve(spark, UnresolvedTable(tableId.nameParts, cmd, None), cmd) + } + + /** Applies standard v2 table resolution to an unresolved Delta table plan node */ + def resolve(spark: SparkSession, unresolved: LogicalPlan, cmd: String): DeltaTableV2 = + extractFrom(spark.sessionState.analyzer.ResolveRelations(unresolved), cmd) + + /** + * Extracts the DeltaTableV2 from a resolved Delta table plan node, throwing "table not found" if + * the node does not actually represent a resolved Delta table. 
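The companion object above keeps a process-wide DeltaLog -> ClickHouseTableV2 map that each table instance populates via cacheThis(), plus a ThreadLocal fallback (temporalThreadLocalCHTable) for the window during CTAS when the table has not been registered yet; getTable consults the map first, then the thread-local, and otherwise fails. Below is a small generic sketch of that resolution order, with Registry and its keys as illustrative stand-ins for the real classes.

// --- illustrative sketch start (not part of the patch)
import scala.collection.concurrent.TrieMap

class Registry[K, V] {
  private val byKey = new TrieMap[K, V]()
  private val pending = new ThreadLocal[V]()            // table still being built, e.g. during CTAS

  def register(key: K, value: V): Unit = byKey.put(key, value)   // what cacheThis() does
  def beginPending(value: V): Unit = pending.set(value)
  def clearPending(): Unit = pending.remove()

  /** Same resolution order as getTable: cached entry first, then the thread-local one. */
  def lookup(key: K): V =
    byKey.get(key)
      .orElse(Option(pending.get()))
      .getOrElse(throw new IllegalStateException(s"No entry registered for $key"))
}
// --- illustrative sketch end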
+ */ + def extractFrom(plan: LogicalPlan, cmd: String): DeltaTableV2 = plan match { + case ResolvedTable(_, _, d: ClickHouseTableV2, _) => d + case ResolvedTable(_, _, d: DeltaTableV2, _) => d + case ResolvedTable(_, _, t: V1Table, _) + if CHDataSourceUtils.isClickHouseTable(t.catalogTable) => + new ClickHouseTableV2(SparkSession.active, new Path(t.v1Table.location), Some(t.v1Table)) + case ResolvedTable(_, _, t: V1Table, _) if DeltaTableUtils.isDeltaTable(t.catalogTable) => + DeltaTableV2(SparkSession.active, new Path(t.v1Table.location), Some(t.v1Table)) + case _ => throw DeltaErrors.notADeltaTableException(cmd) + } +} diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/DeleteCommand.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/DeleteCommand.scala new file mode 100644 index 0000000000000..dec1f4b9c3f5b --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/DeleteCommand.scala @@ -0,0 +1,557 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.delta.commands + +import java.util.concurrent.TimeUnit + +import org.apache.spark.sql.delta.metric.IncrementMetric +import org.apache.spark.sql.delta._ +import org.apache.spark.sql.delta.actions.{Action, AddCDCFile, AddFile, FileAction} +import org.apache.spark.sql.delta.commands.DeleteCommand.{rewritingFilesMsg, FINDING_TOUCHED_FILES_MSG} +import org.apache.spark.sql.delta.commands.MergeIntoCommandBase.totalBytesAndDistinctPartitionValues +import org.apache.spark.sql.delta.files.TahoeBatchFileIndex +import org.apache.spark.sql.delta.sources.DeltaSQLConf +import com.fasterxml.jackson.databind.annotation.JsonDeserialize + +import org.apache.spark.SparkContext +import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession} +import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases +import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, EqualNullSafe, Expression, If, Literal, Not} +import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.logical.{DeltaDelete, LogicalPlan} +import org.apache.spark.sql.execution.command.LeafRunnableCommand +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.metric.SQLMetrics.{createMetric, createTimingMetric} +import org.apache.spark.sql.functions.{col, explode, input_file_name, split} +import org.apache.spark.sql.types.LongType + +/** + * Gluten overwrite Delta: + * + * This file is copied from Delta 3.2.0. It is modified to overcome the following issues: + * 1. 
In the ClickHouse backend, we can't implement input_file_name() correctly; we can only implement
+ *    it so that it returns a list of filenames (concatenated by ',').
+ */
+
+trait DeleteCommandMetrics { self: LeafRunnableCommand =>
+  @transient private lazy val sc: SparkContext = SparkContext.getOrCreate()
+
+  def createMetrics: Map[String, SQLMetric] = Map[String, SQLMetric](
+    "numRemovedFiles" -> createMetric(sc, "number of files removed."),
+    "numAddedFiles" -> createMetric(sc, "number of files added."),
+    "numDeletedRows" -> createMetric(sc, "number of rows deleted."),
+    "numFilesBeforeSkipping" -> createMetric(sc, "number of files before skipping"),
+    "numBytesBeforeSkipping" -> createMetric(sc, "number of bytes before skipping"),
+    "numFilesAfterSkipping" -> createMetric(sc, "number of files after skipping"),
+    "numBytesAfterSkipping" -> createMetric(sc, "number of bytes after skipping"),
+    "numPartitionsAfterSkipping" -> createMetric(sc, "number of partitions after skipping"),
+    "numPartitionsAddedTo" -> createMetric(sc, "number of partitions added"),
+    "numPartitionsRemovedFrom" -> createMetric(sc, "number of partitions removed"),
+    "numCopiedRows" -> createMetric(sc, "number of rows copied"),
+    "numAddedBytes" -> createMetric(sc, "number of bytes added"),
+    "numRemovedBytes" -> createMetric(sc, "number of bytes removed"),
+    "executionTimeMs" ->
+      createTimingMetric(sc, "time taken to execute the entire operation"),
+    "scanTimeMs" ->
+      createTimingMetric(sc, "time taken to scan the files for matches"),
+    "rewriteTimeMs" ->
+      createTimingMetric(sc, "time taken to rewrite the matched files"),
+    "numAddedChangeFiles" -> createMetric(sc, "number of change data capture files generated"),
+    "changeFileBytes" -> createMetric(sc, "total size of change data capture files generated"),
+    "numTouchedRows" -> createMetric(sc, "number of rows touched"),
+    "numDeletionVectorsAdded" -> createMetric(sc, "number of deletion vectors added"),
+    "numDeletionVectorsRemoved" -> createMetric(sc, "number of deletion vectors removed"),
+    "numDeletionVectorsUpdated" -> createMetric(sc, "number of deletion vectors updated")
+  )
+
+  def getDeletedRowsFromAddFilesAndUpdateMetrics(files: Seq[AddFile]) : Option[Long] = {
+    if (!conf.getConf(DeltaSQLConf.DELTA_DML_METRICS_FROM_METADATA)) {
+      return None;
+    }
+    // No file to get metadata, return none to be consistent with metadata stats disabled
+    if (files.isEmpty) {
+      return None
+    }
+    // Return None if any file does not contain numLogicalRecords status
+    var count: Long = 0
+    for (file <- files) {
+      if (file.numLogicalRecords.isEmpty) {
+        return None
+      }
+      count += file.numLogicalRecords.get
+    }
+    metrics("numDeletedRows").set(count)
+    return Some(count)
+  }
+}
+
+/**
+ * Performs a Delete based on the search condition
+ *
+ * Algorithm:
+ *   1) Scan all the files and determine which files have
+ *      the rows that need to be deleted.
+ *   2) Traverse the affected files and rebuild the touched files.
+ *   3) Use the Delta protocol to atomically write the remaining rows to new files and remove
+ *      the affected files that are identified in step 1.
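The Gluten note at the top of this file explains that input_file_name() on the ClickHouse backend may yield several file names joined by ','; the modified block further down in performDelete therefore splits and explodes the column before collecting the distinct paths to rewrite. A self-contained illustration of that transformation follows; the sample data, column names and local session are assumptions made for the example.

// --- illustrative sketch start (not part of the patch)
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, explode, split}

object InputFileNameSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("sketch").getOrCreate()
    import spark.implicits._

    // Stand-in for values produced by input_file_name() on the ClickHouse backend.
    val df = Seq("part-0.parquet,part-1.parquet", "part-1.parquet").toDF("input_files")

    val filesToRewrite = df
      .select(explode(split(col("input_files"), ",")).as("file"))
      .distinct()
      .as[String]
      .collect()

    filesToRewrite.foreach(println)  // part-0.parquet and part-1.parquet, order not guaranteed
    spark.stop()
  }
}
// --- illustrative sketch end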
+ */ +case class DeleteCommand( + deltaLog: DeltaLog, + catalogTable: Option[CatalogTable], + target: LogicalPlan, + condition: Option[Expression]) + extends LeafRunnableCommand with DeltaCommand with DeleteCommandMetrics { + + override def innerChildren: Seq[QueryPlan[_]] = Seq(target) + + override val output: Seq[Attribute] = Seq(AttributeReference("num_affected_rows", LongType)()) + + override lazy val metrics = createMetrics + + final override def run(sparkSession: SparkSession): Seq[Row] = { + recordDeltaOperation(deltaLog, "delta.dml.delete") { + deltaLog.withNewTransaction(catalogTable) { txn => + DeltaLog.assertRemovable(txn.snapshot) + if (hasBeenExecuted(txn, sparkSession)) { + sendDriverMetrics(sparkSession, metrics) + return Seq.empty + } + + val deleteActions = performDelete(sparkSession, deltaLog, txn) + txn.commitIfNeeded(actions = deleteActions, + op = DeltaOperations.Delete(condition.toSeq), + tags = RowTracking.addPreservedRowTrackingTagIfNotSet(txn.snapshot)) + } + // Re-cache all cached plans(including this relation itself, if it's cached) that refer to + // this data source relation. + sparkSession.sharedState.cacheManager.recacheByPlan(sparkSession, target) + } + + // Adjust for deletes at partition boundaries. Deletes at partition boundaries is a metadata + // operation, therefore we don't actually have any information around how many rows were deleted + // While this info may exist in the file statistics, it's not guaranteed that we have these + // statistics. To avoid any performance regressions, we currently just return a -1 in such cases + if (metrics("numRemovedFiles").value > 0 && metrics("numDeletedRows").value == 0) { + Seq(Row(-1L)) + } else { + Seq(Row(metrics("numDeletedRows").value)) + } + } + + def performDelete( + sparkSession: SparkSession, + deltaLog: DeltaLog, + txn: OptimisticTransaction): Seq[Action] = { + import org.apache.spark.sql.delta.implicits._ + + var numRemovedFiles: Long = 0 + var numAddedFiles: Long = 0 + var numAddedChangeFiles: Long = 0 + var scanTimeMs: Long = 0 + var rewriteTimeMs: Long = 0 + var numAddedBytes: Long = 0 + var changeFileBytes: Long = 0 + var numRemovedBytes: Long = 0 + var numFilesBeforeSkipping: Long = 0 + var numBytesBeforeSkipping: Long = 0 + var numFilesAfterSkipping: Long = 0 + var numBytesAfterSkipping: Long = 0 + var numPartitionsAfterSkipping: Option[Long] = None + var numPartitionsRemovedFrom: Option[Long] = None + var numPartitionsAddedTo: Option[Long] = None + var numDeletedRows: Option[Long] = None + var numCopiedRows: Option[Long] = None + var numDeletionVectorsAdded: Long = 0 + var numDeletionVectorsRemoved: Long = 0 + var numDeletionVectorsUpdated: Long = 0 + + val startTime = System.nanoTime() + val numFilesTotal = txn.snapshot.numOfFiles + + val deleteActions: Seq[Action] = condition match { + case None => + // Case 1: Delete the whole table if the condition is true + val reportRowLevelMetrics = conf.getConf(DeltaSQLConf.DELTA_DML_METRICS_FROM_METADATA) + val allFiles = txn.filterFiles(Nil, keepNumRecords = reportRowLevelMetrics) + + numRemovedFiles = allFiles.size + numDeletionVectorsRemoved = allFiles.count(_.deletionVector != null) + scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 + val (numBytes, numPartitions) = totalBytesAndDistinctPartitionValues(allFiles) + numRemovedBytes = numBytes + numFilesBeforeSkipping = numRemovedFiles + numBytesBeforeSkipping = numBytes + numFilesAfterSkipping = numRemovedFiles + numBytesAfterSkipping = numBytes + numDeletedRows = 
getDeletedRowsFromAddFilesAndUpdateMetrics(allFiles) + + if (txn.metadata.partitionColumns.nonEmpty) { + numPartitionsAfterSkipping = Some(numPartitions) + numPartitionsRemovedFrom = Some(numPartitions) + numPartitionsAddedTo = Some(0) + } + val operationTimestamp = System.currentTimeMillis() + allFiles.map(_.removeWithTimestamp(operationTimestamp)) + case Some(cond) => + val (metadataPredicates, otherPredicates) = + DeltaTableUtils.splitMetadataAndDataPredicates( + cond, txn.metadata.partitionColumns, sparkSession) + + numFilesBeforeSkipping = txn.snapshot.numOfFiles + numBytesBeforeSkipping = txn.snapshot.sizeInBytes + + if (otherPredicates.isEmpty) { + // Case 2: The condition can be evaluated using metadata only. + // Delete a set of files without the need of scanning any data files. + val operationTimestamp = System.currentTimeMillis() + val reportRowLevelMetrics = conf.getConf(DeltaSQLConf.DELTA_DML_METRICS_FROM_METADATA) + val candidateFiles = + txn.filterFiles(metadataPredicates, keepNumRecords = reportRowLevelMetrics) + + scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 + numRemovedFiles = candidateFiles.size + numRemovedBytes = candidateFiles.map(_.size).sum + numFilesAfterSkipping = candidateFiles.size + numDeletionVectorsRemoved = candidateFiles.count(_.deletionVector != null) + val (numCandidateBytes, numCandidatePartitions) = + totalBytesAndDistinctPartitionValues(candidateFiles) + numBytesAfterSkipping = numCandidateBytes + numDeletedRows = getDeletedRowsFromAddFilesAndUpdateMetrics(candidateFiles) + + if (txn.metadata.partitionColumns.nonEmpty) { + numPartitionsAfterSkipping = Some(numCandidatePartitions) + numPartitionsRemovedFrom = Some(numCandidatePartitions) + numPartitionsAddedTo = Some(0) + } + candidateFiles.map(_.removeWithTimestamp(operationTimestamp)) + } else { + // Case 3: Delete the rows based on the condition. + + // Should we write the DVs to represent the deleted rows? + val shouldWriteDVs = shouldWritePersistentDeletionVectors(sparkSession, txn) + + val candidateFiles = txn.filterFiles( + metadataPredicates ++ otherPredicates, + keepNumRecords = shouldWriteDVs) + // `candidateFiles` contains the files filtered using statistics and delete condition + // They may or may not contains any rows that need to be deleted. + + numFilesAfterSkipping = candidateFiles.size + val (numCandidateBytes, numCandidatePartitions) = + totalBytesAndDistinctPartitionValues(candidateFiles) + numBytesAfterSkipping = numCandidateBytes + if (txn.metadata.partitionColumns.nonEmpty) { + numPartitionsAfterSkipping = Some(numCandidatePartitions) + } + + val nameToAddFileMap = generateCandidateFileMap(deltaLog.dataPath, candidateFiles) + + val fileIndex = new TahoeBatchFileIndex( + sparkSession, "delete", candidateFiles, deltaLog, deltaLog.dataPath, txn.snapshot) + if (shouldWriteDVs) { + val targetDf = DMLWithDeletionVectorsHelper.createTargetDfForScanningForMatches( + sparkSession, + target, + fileIndex) + + // Does the target table already has DVs enabled? If so, we need to read the table + // with deletion vectors. 
+ val mustReadDeletionVectors = DeletionVectorUtils.deletionVectorsReadable(txn.snapshot) + + val touchedFiles = DMLWithDeletionVectorsHelper.findTouchedFiles( + sparkSession, + txn, + mustReadDeletionVectors, + deltaLog, + targetDf, + fileIndex, + cond, + opName = "DELETE") + + if (touchedFiles.nonEmpty) { + val (actions, metricMap) = DMLWithDeletionVectorsHelper.processUnmodifiedData( + sparkSession, + touchedFiles, + txn.snapshot) + metrics("numDeletedRows").set(metricMap("numModifiedRows")) + numDeletionVectorsAdded = metricMap("numDeletionVectorsAdded") + numDeletionVectorsRemoved = metricMap("numDeletionVectorsRemoved") + numDeletionVectorsUpdated = metricMap("numDeletionVectorsUpdated") + numRemovedFiles = metricMap("numRemovedFiles") + actions + } else { + Nil // Nothing to update + } + } else { + // Keep everything from the resolved target except a new TahoeFileIndex + // that only involves the affected files instead of all files. + val newTarget = DeltaTableUtils.replaceFileIndex(target, fileIndex) + val data = Dataset.ofRows(sparkSession, newTarget) + val incrDeletedCountExpr = IncrementMetric(TrueLiteral, metrics("numDeletedRows")) + val filesToRewrite = + withStatusCode("DELTA", FINDING_TOUCHED_FILES_MSG) { + if (candidateFiles.isEmpty) { + Array.empty[String] + } else { + // --- modified start + data.filter(new Column(cond)) + .select(input_file_name().as("input_files")) + .filter(new Column(incrDeletedCountExpr)) + .select(explode(split(col("input_files"), ","))) + .distinct() + .as[String] + .collect() + // --- modified end + } + } + + numRemovedFiles = filesToRewrite.length + scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 + if (filesToRewrite.isEmpty) { + // Case 3.1: no row matches and no delete will be triggered + if (txn.metadata.partitionColumns.nonEmpty) { + numPartitionsRemovedFrom = Some(0) + numPartitionsAddedTo = Some(0) + } + Nil + } else { + // Case 3.2: some files need an update to remove the deleted files + // Do the second pass and just read the affected files + val baseRelation = buildBaseRelation( + sparkSession, txn, "delete", deltaLog.dataPath, filesToRewrite, nameToAddFileMap) + // Keep everything from the resolved target except a new TahoeFileIndex + // that only involves the affected files instead of all files. 
+ val newTarget = DeltaTableUtils.replaceFileIndex(target, baseRelation.location) + val targetDF = RowTracking.preserveRowTrackingColumns( + dfWithoutRowTrackingColumns = Dataset.ofRows(sparkSession, newTarget), + snapshot = txn.snapshot) + val filterCond = Not(EqualNullSafe(cond, Literal.TrueLiteral)) + val rewrittenActions = rewriteFiles(txn, targetDF, filterCond, filesToRewrite.length) + val (changeFiles, rewrittenFiles) = rewrittenActions + .partition(_.isInstanceOf[AddCDCFile]) + numAddedFiles = rewrittenFiles.size + val removedFiles = filesToRewrite.map(f => + getTouchedFile(deltaLog.dataPath, f, nameToAddFileMap)) + val (removedBytes, removedPartitions) = + totalBytesAndDistinctPartitionValues(removedFiles) + numRemovedBytes = removedBytes + val (rewrittenBytes, rewrittenPartitions) = + totalBytesAndDistinctPartitionValues(rewrittenFiles) + numAddedBytes = rewrittenBytes + if (txn.metadata.partitionColumns.nonEmpty) { + numPartitionsRemovedFrom = Some(removedPartitions) + numPartitionsAddedTo = Some(rewrittenPartitions) + } + numAddedChangeFiles = changeFiles.size + changeFileBytes = changeFiles.collect { case f: AddCDCFile => f.size }.sum + rewriteTimeMs = + TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTime) - scanTimeMs + numDeletedRows = Some(metrics("numDeletedRows").value) + numCopiedRows = + Some(metrics("numTouchedRows").value - metrics("numDeletedRows").value) + numDeletionVectorsRemoved = removedFiles.count(_.deletionVector != null) + val operationTimestamp = System.currentTimeMillis() + removeFilesFromPaths( + deltaLog, nameToAddFileMap, filesToRewrite, operationTimestamp) ++ rewrittenActions + } + } + } + } + metrics("numRemovedFiles").set(numRemovedFiles) + metrics("numAddedFiles").set(numAddedFiles) + val executionTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 + metrics("executionTimeMs").set(executionTimeMs) + metrics("scanTimeMs").set(scanTimeMs) + metrics("rewriteTimeMs").set(rewriteTimeMs) + metrics("numAddedChangeFiles").set(numAddedChangeFiles) + metrics("changeFileBytes").set(changeFileBytes) + metrics("numAddedBytes").set(numAddedBytes) + metrics("numRemovedBytes").set(numRemovedBytes) + metrics("numFilesBeforeSkipping").set(numFilesBeforeSkipping) + metrics("numBytesBeforeSkipping").set(numBytesBeforeSkipping) + metrics("numFilesAfterSkipping").set(numFilesAfterSkipping) + metrics("numBytesAfterSkipping").set(numBytesAfterSkipping) + metrics("numDeletionVectorsAdded").set(numDeletionVectorsAdded) + metrics("numDeletionVectorsRemoved").set(numDeletionVectorsRemoved) + metrics("numDeletionVectorsUpdated").set(numDeletionVectorsUpdated) + numPartitionsAfterSkipping.foreach(metrics("numPartitionsAfterSkipping").set) + numPartitionsAddedTo.foreach(metrics("numPartitionsAddedTo").set) + numPartitionsRemovedFrom.foreach(metrics("numPartitionsRemovedFrom").set) + numCopiedRows.foreach(metrics("numCopiedRows").set) + txn.registerSQLMetrics(sparkSession, metrics) + sendDriverMetrics(sparkSession, metrics) + + recordDeltaEvent( + deltaLog, + "delta.dml.delete.stats", + data = DeleteMetric( + condition = condition.map(_.sql).getOrElse("true"), + numFilesTotal, + numFilesAfterSkipping, + numAddedFiles, + numRemovedFiles, + numAddedFiles, + numAddedChangeFiles = numAddedChangeFiles, + numFilesBeforeSkipping, + numBytesBeforeSkipping, + numFilesAfterSkipping, + numBytesAfterSkipping, + numPartitionsAfterSkipping, + numPartitionsAddedTo, + numPartitionsRemovedFrom, + numCopiedRows, + numDeletedRows, + numAddedBytes, + numRemovedBytes, + changeFileBytes = 
changeFileBytes, + scanTimeMs, + rewriteTimeMs, + numDeletionVectorsAdded, + numDeletionVectorsRemoved, + numDeletionVectorsUpdated) + ) + + if (deleteActions.nonEmpty) { + createSetTransaction(sparkSession, deltaLog).toSeq ++ deleteActions + } else { + Seq.empty + } + } + + /** + * Returns the list of [[AddFile]]s and [[AddCDCFile]]s that have been re-written. + */ + private def rewriteFiles( + txn: OptimisticTransaction, + baseData: DataFrame, + filterCondition: Expression, + numFilesToRewrite: Long): Seq[FileAction] = { + val shouldWriteCdc = DeltaConfigs.CHANGE_DATA_FEED.fromMetaData(txn.metadata) + + // number of total rows that we have seen / are either copying or deleting (sum of both). + val incrTouchedCountExpr = IncrementMetric(TrueLiteral, metrics("numTouchedRows")) + + withStatusCode( + "DELTA", rewritingFilesMsg(numFilesToRewrite)) { + val dfToWrite = if (shouldWriteCdc) { + import org.apache.spark.sql.delta.commands.cdc.CDCReader._ + // The logic here ends up being surprisingly elegant, with all source rows ending up in + // the output. Recall that we flipped the user-provided delete condition earlier, before the + // call to `rewriteFiles`. All rows which match this latest `filterCondition` are retained + // as table data, while all rows which don't match are removed from the rewritten table data + // but do get included in the output as CDC events. + baseData + .filter(new Column(incrTouchedCountExpr)) + .withColumn( + CDC_TYPE_COLUMN_NAME, + new Column(If(filterCondition, CDC_TYPE_NOT_CDC, CDC_TYPE_DELETE)) + ) + } else { + baseData + .filter(new Column(incrTouchedCountExpr)) + .filter(new Column(filterCondition)) + } + + txn.writeFiles(dfToWrite) + } + } + + def shouldWritePersistentDeletionVectors( + spark: SparkSession, txn: OptimisticTransaction): Boolean = { + spark.conf.get(DeltaSQLConf.DELETE_USE_PERSISTENT_DELETION_VECTORS) && + DeletionVectorUtils.deletionVectorsWritable(txn.snapshot) + } +} + +object DeleteCommand { + def apply(delete: DeltaDelete): DeleteCommand = { + EliminateSubqueryAliases(delete.child) match { + case DeltaFullTable(relation, fileIndex) => + DeleteCommand(fileIndex.deltaLog, relation.catalogTable, delete.child, delete.condition) + case o => + throw DeltaErrors.notADeltaSourceException("DELETE", Some(o)) + } + } + + val FILE_NAME_COLUMN: String = "_input_file_name_" + val FINDING_TOUCHED_FILES_MSG: String = "Finding files to rewrite for DELETE operation" + + def rewritingFilesMsg(numFilesToRewrite: Long): String = + s"Rewriting $numFilesToRewrite files for DELETE operation" +} + +/** + * Used to report details about delete. + * + * @param condition: what was the delete condition + * @param numFilesTotal: how big is the table + * @param numTouchedFiles: how many files did we touch. Alias for `numFilesAfterSkipping` + * @param numRewrittenFiles: how many files had to be rewritten. Alias for `numAddedFiles` + * @param numRemovedFiles: how many files we removed. Alias for `numTouchedFiles` + * @param numAddedFiles: how many files we added. 
Alias for `numRewrittenFiles` + * @param numAddedChangeFiles: how many change files were generated + * @param numFilesBeforeSkipping: how many candidate files before skipping + * @param numBytesBeforeSkipping: how many candidate bytes before skipping + * @param numFilesAfterSkipping: how many candidate files after skipping + * @param numBytesAfterSkipping: how many candidate bytes after skipping + * @param numPartitionsAfterSkipping: how many candidate partitions after skipping + * @param numPartitionsAddedTo: how many new partitions were added + * @param numPartitionsRemovedFrom: how many partitions were removed + * @param numCopiedRows: how many rows were copied + * @param numDeletedRows: how many rows were deleted + * @param numBytesAdded: how many bytes were added + * @param numBytesRemoved: how many bytes were removed + * @param changeFileBytes: total size of change files generated + * @param scanTimeMs: how long did finding take + * @param rewriteTimeMs: how long did rewriting take + * @param numDeletionVectorsAdded: how many deletion vectors were added + * @param numDeletionVectorsRemoved: how many deletion vectors were removed + * @param numDeletionVectorsUpdated: how many deletion vectors were updated + * + * @note All the time units are milliseconds. + */ +case class DeleteMetric( + condition: String, + numFilesTotal: Long, + numTouchedFiles: Long, + numRewrittenFiles: Long, + numRemovedFiles: Long, + numAddedFiles: Long, + numAddedChangeFiles: Long, + numFilesBeforeSkipping: Long, + numBytesBeforeSkipping: Long, + numFilesAfterSkipping: Long, + numBytesAfterSkipping: Long, + numPartitionsAfterSkipping: Option[Long], + numPartitionsAddedTo: Option[Long], + numPartitionsRemovedFrom: Option[Long], + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + numCopiedRows: Option[Long], + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + numDeletedRows: Option[Long], + numBytesAdded: Long, + numBytesRemoved: Long, + changeFileBytes: Long, + scanTimeMs: Long, + rewriteTimeMs: Long, + numDeletionVectorsAdded: Long, + numDeletionVectorsRemoved: Long, + numDeletionVectorsUpdated: Long +) diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala new file mode 100644 index 0000000000000..5b21702202288 --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala @@ -0,0 +1,608 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.delta.commands + +import java.util.ConcurrentModificationException + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.sql.delta.skipping.MultiDimClustering +import org.apache.spark.sql.delta.skipping.clustering.{ClusteredTableUtils, ClusteringColumnInfo} +import org.apache.spark.sql.delta._ +import org.apache.spark.sql.delta.DeltaOperations.Operation +import org.apache.spark.sql.delta.actions.{Action, AddFile, DeletionVectorDescriptor, FileAction, RemoveFile} +import org.apache.spark.sql.delta.commands.optimize._ +import org.apache.spark.sql.delta.files.SQLMetricsReporting +import org.apache.spark.sql.delta.schema.SchemaUtils +import org.apache.spark.sql.delta.sources.DeltaSQLConf + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext.SPARK_JOB_GROUP_ID +import org.apache.spark.sql.{AnalysisException, Encoders, Row, SparkSession} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode} +import org.apache.spark.sql.execution.command.RunnableCommand +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig +import org.apache.spark.sql.execution.datasources.v2.clickhouse.metadata.AddMergeTreeParts +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric +import org.apache.spark.sql.types._ +import org.apache.spark.util.{SystemClock, ThreadUtils} + +/** + * Gluten overwrite Delta: + * + * This file is copied from Delta 3.2.0. It is modified in: + * 1. getDeltaTable supports to get ClickHouseTableV2 + * 2. runOptimizeBinJobClickhouse + * 3. 
groupFilesIntoBinsClickhouse + */ + +/** Base class defining abstract optimize command */ +abstract class OptimizeTableCommandBase extends RunnableCommand with DeltaCommand { + + override val output: Seq[Attribute] = Seq( + AttributeReference("path", StringType)(), + AttributeReference("metrics", Encoders.product[OptimizeMetrics].schema)()) + + /** + * Validates ZOrderBy columns + * - validates that partitions columns are not used in `unresolvedZOrderByCols` + * - validates that we already collect stats for all the columns used in `unresolvedZOrderByCols` + * + * @param spark [[SparkSession]] to use + * @param txn the [[OptimisticTransaction]] being used to optimize + * @param unresolvedZOrderByCols Seq of [[UnresolvedAttribute]] corresponding to zOrderBy columns + */ + def validateZorderByColumns( + spark: SparkSession, + txn: OptimisticTransaction, + unresolvedZOrderByCols: Seq[UnresolvedAttribute]): Unit = { + if (unresolvedZOrderByCols.isEmpty) return + val metadata = txn.snapshot.metadata + val partitionColumns = metadata.partitionColumns.toSet + val dataSchema = + StructType(metadata.schema.filterNot(c => partitionColumns.contains(c.name))) + val df = spark.createDataFrame(new java.util.ArrayList[Row](), dataSchema) + val checkColStat = spark.sessionState.conf.getConf( + DeltaSQLConf.DELTA_OPTIMIZE_ZORDER_COL_STAT_CHECK) + val statCollectionSchema = txn.snapshot.statCollectionLogicalSchema + val colsWithoutStats = ArrayBuffer[String]() + + unresolvedZOrderByCols.foreach { colAttribute => + val colName = colAttribute.name + if (checkColStat) { + try { + SchemaUtils.findColumnPosition(colAttribute.nameParts, statCollectionSchema) + } catch { + case e: AnalysisException if e.getMessage.contains("Couldn't find column") => + colsWithoutStats.append(colName) + } + } + val isNameEqual = spark.sessionState.conf.resolver + if (partitionColumns.find(isNameEqual(_, colName)).nonEmpty) { + throw DeltaErrors.zOrderingOnPartitionColumnException(colName) + } + if (df.queryExecution.analyzed.resolve(colAttribute.nameParts, isNameEqual).isEmpty) { + throw DeltaErrors.zOrderingColumnDoesNotExistException(colName) + } + } + if (checkColStat && colsWithoutStats.nonEmpty) { + throw DeltaErrors.zOrderingOnColumnWithNoStatsException( + colsWithoutStats.toSeq, spark) + } + } +} + +object OptimizeTableCommand { + /** + * Alternate constructor that converts a provided path or table identifier into the + * correct child LogicalPlan node. If both path and tableIdentifier are specified (or + * if both are None), this method will throw an exception. If a table identifier is + * specified, the child LogicalPlan will be an [[UnresolvedTable]] whereas if a path + * is specified, it will be an [[UnresolvedPathBasedDeltaTable]]. + * + * Note that the returned OptimizeTableCommand will have an *unresolved* child table + * and hence, the command needs to be analyzed before it can be executed. + */ + def apply( + path: Option[String], + tableIdentifier: Option[TableIdentifier], + userPartitionPredicates: Seq[String], + optimizeContext: DeltaOptimizeContext = DeltaOptimizeContext())( + zOrderBy: Seq[UnresolvedAttribute]): OptimizeTableCommand = { + val plan = UnresolvedDeltaPathOrIdentifier(path, tableIdentifier, "OPTIMIZE") + OptimizeTableCommand(plan, userPartitionPredicates, optimizeContext)(zOrderBy) + } +} + +/** + * The `optimize` command implementation for Spark SQL. 
Example SQL: + * {{{ + * OPTIMIZE ('/path/to/dir' | delta.table) [WHERE part = 25]; + * }}} + */ +case class OptimizeTableCommand( + override val child: LogicalPlan, + userPartitionPredicates: Seq[String], + optimizeContext: DeltaOptimizeContext +)(val zOrderBy: Seq[UnresolvedAttribute]) + extends OptimizeTableCommandBase with RunnableCommand with UnaryNode { + + override val otherCopyArgs: Seq[AnyRef] = zOrderBy :: Nil + + override protected def withNewChildInternal(newChild: LogicalPlan): OptimizeTableCommand = + copy(child = newChild)(zOrderBy) + + override def run(sparkSession: SparkSession): Seq[Row] = { + // --- modified start + val table = OptimizeTableCommandOverwrites.getDeltaTable(child, "OPTIMIZE") + // --- modified end + + val txn = table.startTransaction() + if (txn.readVersion == -1) { + throw DeltaErrors.notADeltaTableException(table.deltaLog.dataPath.toString) + } + + if (ClusteredTableUtils.isSupported(txn.protocol)) { + if (userPartitionPredicates.nonEmpty) { + throw DeltaErrors.clusteringWithPartitionPredicatesException(userPartitionPredicates) + } + if (zOrderBy.nonEmpty) { + throw DeltaErrors.clusteringWithZOrderByException(zOrderBy) + } + } + + val partitionColumns = txn.snapshot.metadata.partitionColumns + // Parse the predicate expression into Catalyst expression and verify only simple filters + // on partition columns are present + + val partitionPredicates = userPartitionPredicates.flatMap { predicate => + val predicates = parsePredicates(sparkSession, predicate) + verifyPartitionPredicates( + sparkSession, + partitionColumns, + predicates) + predicates + } + + validateZorderByColumns(sparkSession, txn, zOrderBy) + val zOrderByColumns = zOrderBy.map(_.name).toSeq + + new OptimizeExecutor( + sparkSession, + txn, + partitionPredicates, + zOrderByColumns, + isAutoCompact = false, + optimizeContext + ).optimize() + } +} + +/** + * Stored all runtime context information that can control the execution of optimize. + * + * @param reorg The REORG operation that triggered the rewriting task, if any. + * @param minFileSize Files which are smaller than this threshold will be selected for compaction. + * If not specified, [[DeltaSQLConf.DELTA_OPTIMIZE_MIN_FILE_SIZE]] will be used. + * This parameter must be set to `0` when [[reorg]] is set. + * @param maxDeletedRowsRatio Files with a ratio of soft-deleted rows to the total rows larger than + * this threshold will be rewritten by the OPTIMIZE command. If not + * specified, [[DeltaSQLConf.DELTA_OPTIMIZE_MAX_DELETED_ROWS_RATIO]] + * will be used. This parameter must be set to `0` when [[reorg]] is set. + */ +case class DeltaOptimizeContext( + reorg: Option[DeltaReorgOperation] = None, + minFileSize: Option[Long] = None, + maxFileSize: Option[Long] = None, + maxDeletedRowsRatio: Option[Double] = None) { + if (reorg.nonEmpty) { + require( + minFileSize.contains(0L) && maxDeletedRowsRatio.contains(0d), + "minFileSize and maxDeletedRowsRatio must be 0 when running REORG TABLE.") + } +} + +/** + * Optimize job which compacts small files into larger files to reduce + * the number of files and potentially allow more efficient reads. + * + * @param sparkSession Spark environment reference. + * @param txn The transaction used to optimize this table + * @param partitionPredicate List of partition predicates to select subset of files to optimize. 
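In the Gluten version, OptimizeExecutor.optimize() (defined just below) first checks ClickHouseConfig.isMergeTreeFormatEngine on the table configuration; for MergeTree tables it groups the candidate parts by bucket as well as by partition, bins them with groupFilesIntoBinsClickhouse and runs each bin through runOptimizeBinJobClickhouse in parallel via ThreadUtils.parmap, while non-MergeTree tables keep the stock per-partition grouping. The sketch below only contrasts the two grouping shapes; Part is an illustrative stand-in for AddMergeTreeParts.

// --- illustrative sketch start (not part of the patch)
object GroupingSketch {
  case class Part(bucketNum: String, partitionValues: Map[String, String])

  // Stock Delta OPTIMIZE: one group of files per partition.
  def vanillaGroups(files: Seq[Part]): Seq[(Map[String, String], Seq[Part])] =
    files.groupBy(_.partitionValues).toSeq

  // MergeTree path: parts from different buckets are kept apart, so each compaction
  // job merges parts belonging to a single (bucket, partition) combination.
  def mergeTreeGroups(files: Seq[Part]): Seq[((String, Map[String, String]), Seq[Part])] =
    files.groupBy(p => (p.bucketNum, p.partitionValues)).toSeq
}
// --- illustrative sketch end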
+ */ +class OptimizeExecutor( + sparkSession: SparkSession, + txn: OptimisticTransaction, + partitionPredicate: Seq[Expression], + zOrderByColumns: Seq[String], + isAutoCompact: Boolean, + optimizeContext: DeltaOptimizeContext) + extends DeltaCommand with SQLMetricsReporting with Serializable { + + /** + * In which mode the Optimize command is running. There are three valid modes: + * 1. Compaction + * 2. ZOrder + * 3. Clustering + */ + private val optimizeStrategy = + OptimizeTableStrategy(sparkSession, txn.snapshot, optimizeContext, zOrderByColumns) + + /** Timestamp to use in [[FileAction]] */ + private val operationTimestamp = new SystemClock().getTimeMillis() + + private val isClusteredTable = ClusteredTableUtils.isSupported(txn.snapshot.protocol) + + private val isMultiDimClustering = + optimizeStrategy.isInstanceOf[ClusteringStrategy] || + optimizeStrategy.isInstanceOf[ZOrderStrategy] + + private val clusteringColumns: Seq[String] = { + if (zOrderByColumns.nonEmpty) { + zOrderByColumns + } else if (isClusteredTable) { + ClusteringColumnInfo.extractLogicalNames(txn.snapshot) + } else { + Nil + } + } + + def optimize(): Seq[Row] = { + recordDeltaOperation(txn.deltaLog, "delta.optimize") { + + // --- modified start + val isMergeTreeFormat = ClickHouseConfig + .isMergeTreeFormatEngine(txn.deltaLog.unsafeVolatileMetadata.configuration) + // --- modified end + + val minFileSize = optimizeContext.minFileSize.getOrElse( + sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MIN_FILE_SIZE)) + val maxFileSize = optimizeContext.maxFileSize.getOrElse( + sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MAX_FILE_SIZE)) + val maxDeletedRowsRatio = optimizeContext.maxDeletedRowsRatio.getOrElse( + sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MAX_DELETED_ROWS_RATIO)) + + val candidateFiles = txn.filterFiles(partitionPredicate, keepNumRecords = true) + val partitionSchema = txn.metadata.partitionSchema + + val filesToProcess = optimizeContext.reorg match { + case Some(reorgOperation) => reorgOperation.filterFilesToReorg(txn.snapshot, candidateFiles) + case None => filterCandidateFileList(minFileSize, maxDeletedRowsRatio, candidateFiles) + } + // --- modified start + val maxThreads = + sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MAX_THREADS) + val (updates, jobs) = if (isMergeTreeFormat) { + val partitionsToCompact = filesToProcess + .groupBy(file => (file.asInstanceOf[AddMergeTreeParts].bucketNum, file.partitionValues)) + .toSeq + val jobs = OptimizeTableCommandOverwrites + .groupFilesIntoBinsClickhouse(partitionsToCompact, maxFileSize) + val updates = ThreadUtils.parmap(jobs, "OptimizeJob", maxThreads) { + partitionBinGroup => + // --- modified start + OptimizeTableCommandOverwrites.runOptimizeBinJobClickhouse( + txn, + partitionBinGroup._1._2, + partitionBinGroup._1._1, + partitionBinGroup._2, + maxFileSize) + // --- modified end + }.flatten + // uniform the jobs type + (updates, jobs.map(v => (v._1._2 ++ Map("bucketNum" -> v._1.toString()), v._2))) + } else { + val partitionsToCompact = filesToProcess.groupBy(_.partitionValues).toSeq + + val jobs = groupFilesIntoBins(partitionsToCompact) + + val updates = ThreadUtils.parmap(jobs, "OptimizeJob", maxThreads) { partitionBinGroup => + runOptimizeBinJob(txn, partitionBinGroup._1, partitionBinGroup._2, maxFileSize) + }.flatten + (updates, jobs) + } + // --- modified end + + val addedFiles = updates.collect { case a: AddFile => a } + val removedFiles = updates.collect { case r: 
RemoveFile => r } + val removedDVs = filesToProcess.filter(_.deletionVector != null).map(_.deletionVector).toSeq + if (addedFiles.size > 0) { + val metrics = createMetrics(sparkSession.sparkContext, addedFiles, removedFiles, removedDVs) + commitAndRetry(txn, getOperation(), updates, metrics) { newTxn => + val newPartitionSchema = newTxn.metadata.partitionSchema + val candidateSetOld = candidateFiles.map(_.path).toSet + val candidateSetNew = newTxn.filterFiles(partitionPredicate).map(_.path).toSet + + // As long as all of the files that we compacted are still part of the table, + // and the partitioning has not changed it is valid to continue to try + // and commit this checkpoint. + if (candidateSetOld.subsetOf(candidateSetNew) && partitionSchema == newPartitionSchema) { + true + } else { + val deleted = candidateSetOld -- candidateSetNew + logWarning(s"The following compacted files were delete " + + s"during checkpoint ${deleted.mkString(",")}. Aborting the compaction.") + false + } + } + } + + val optimizeStats = OptimizeStats() + optimizeStats.addedFilesSizeStats.merge(addedFiles) + optimizeStats.removedFilesSizeStats.merge(removedFiles) + optimizeStats.numPartitionsOptimized = jobs.map(j => j._1).distinct.size + optimizeStats.numBatches = jobs.size + optimizeStats.totalConsideredFiles = candidateFiles.size + optimizeStats.totalFilesSkipped = optimizeStats.totalConsideredFiles - removedFiles.size + optimizeStats.totalClusterParallelism = sparkSession.sparkContext.defaultParallelism + val numTableColumns = txn.snapshot.metadata.schema.size + optimizeStats.numTableColumns = numTableColumns + optimizeStats.numTableColumnsWithStats = + DeltaConfigs.DATA_SKIPPING_NUM_INDEXED_COLS.fromMetaData(txn.snapshot.metadata) + .min(numTableColumns) + if (removedDVs.size > 0) { + optimizeStats.deletionVectorStats = Some(DeletionVectorStats( + numDeletionVectorsRemoved = removedDVs.size, + numDeletionVectorRowsRemoved = removedDVs.map(_.cardinality).sum)) + } + + optimizeStrategy.updateOptimizeStats(optimizeStats, removedFiles, jobs) + + return Seq(Row(txn.deltaLog.dataPath.toString, optimizeStats.toOptimizeMetrics)) + } + } + + /** + * Helper method to prune the list of selected files based on fileSize and ratio of + * deleted rows according to the deletion vector in [[AddFile]]. + */ + private def filterCandidateFileList( + minFileSize: Long, maxDeletedRowsRatio: Double, files: Seq[AddFile]): Seq[AddFile] = { + + // Select all files in case of multi-dimensional clustering + if (isMultiDimClustering) return files + + def shouldCompactBecauseOfDeletedRows(file: AddFile): Boolean = { + // Always compact files with DVs but without numRecords stats. + // This may be overly aggressive, but it fixes the problem in the long-term, + // as the compacted files will have stats. + (file.deletionVector != null && file.numPhysicalRecords.isEmpty) || + file.deletedToPhysicalRecordsRatio.getOrElse(0d) > maxDeletedRowsRatio + } + + // Select files that are small or have too many deleted rows + files.filter( + addFile => addFile.size < minFileSize || shouldCompactBecauseOfDeletedRows(addFile)) + } + + /** + * Utility methods to group files into bins for optimize. + * + * @param partitionsToCompact List of files to compact group by partition. + * Partition is defined by the partition values (partCol -> partValue) + * @return Sequence of bins. Each bin contains one or more files from the same + * partition and targeted for one output file. 
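The implementation follows right below; as a quick reference, the greedy grouping it describes reduces to the sketch here, with the strategy-specific file ordering and the ZCube/clustering details stripped out. BinPackSketch and its F record are illustrative only.

// --- illustrative sketch start (not part of the patch)
object BinPackSketch {
  case class F(path: String, size: Long)

  /** Greedily packs files, in the given order, into bins of at most maxBinSize bytes. */
  def packIntoBins(files: Seq[F], maxBinSize: Long): Seq[Seq[F]] = {
    val bins = scala.collection.mutable.ArrayBuffer.empty[Seq[F]]
    val current = scala.collection.mutable.ArrayBuffer.empty[F]
    var currentSize = 0L
    files.foreach { f =>
      if (current.nonEmpty && currentSize + f.size > maxBinSize) {
        bins += current.toVector      // close the full bin; each bin becomes one output file
        current.clear()
        currentSize = 0L
      }
      current += f
      currentSize += f.size
    }
    if (current.nonEmpty) bins += current.toVector
    bins.toSeq
  }
}
// --- illustrative sketch end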
+ */ + private def groupFilesIntoBins( + partitionsToCompact: Seq[(Map[String, String], Seq[AddFile])]) + : Seq[(Map[String, String], Seq[AddFile])] = { + val maxBinSize = optimizeStrategy.maxBinSize + partitionsToCompact.flatMap { + case (partition, files) => + val bins = new ArrayBuffer[Seq[AddFile]]() + + val currentBin = new ArrayBuffer[AddFile]() + var currentBinSize = 0L + + val preparedFiles = optimizeStrategy.prepareFilesPerPartition(files) + preparedFiles.foreach { file => + // Generally, a bin is a group of existing files, whose total size does not exceed the + // desired maxBinSize. The output file size depends on the mode: + // 1. Compaction: Files in a bin will be coalesced into a single output file. + // 2. ZOrder: all files in a partition will be read by the + // same job, the data will be range-partitioned and + // numFiles = totalFileSize / maxFileSize will be produced. + // 3. Clustering: Files in a bin belongs to one ZCUBE, the data will be + // range-partitioned and numFiles = totalFileSize / maxFileSize. + if (file.size + currentBinSize > maxBinSize) { + bins += currentBin.toVector + currentBin.clear() + currentBin += file + currentBinSize = file.size + } else { + currentBin += file + currentBinSize += file.size + } + } + + if (currentBin.nonEmpty) { + bins += currentBin.toVector + } + + bins.filter { bin => + bin.size > 1 || // bin has more than one file or + bin.size == 1 && optimizeContext.reorg.nonEmpty || // always rewrite files during reorg + isMultiDimClustering // multi-clustering + }.map(b => (partition, b)) + } + } + + /** + * Utility method to run a Spark job to compact the files in given bin + * + * @param txn [[OptimisticTransaction]] instance in use to commit the changes to DeltaLog. + * @param partition Partition values of the partition that files in [[bin]] belongs to. + * @param bin List of files to compact into one large file. + * @param maxFileSize Targeted output file size in bytes + */ + private def runOptimizeBinJob( + txn: OptimisticTransaction, + partition: Map[String, String], + bin: Seq[AddFile], + maxFileSize: Long): Seq[FileAction] = { + val baseTablePath = txn.deltaLog.dataPath + + var input = txn.deltaLog.createDataFrame(txn.snapshot, bin, actionTypeOpt = Some("Optimize")) + input = RowTracking.preserveRowTrackingColumns(input, txn.snapshot) + val repartitionDF = if (isMultiDimClustering) { + val totalSize = bin.map(_.size).sum + val approxNumFiles = Math.max(1, totalSize / maxFileSize).toInt + MultiDimClustering.cluster( + input, + approxNumFiles, + clusteringColumns, + optimizeStrategy.curve) + } else { + val useRepartition = sparkSession.sessionState.conf.getConf( + DeltaSQLConf.DELTA_OPTIMIZE_REPARTITION_ENABLED) + if (useRepartition) { + input.repartition(numPartitions = 1) + } else { + input.coalesce(numPartitions = 1) + } + } + + val partitionDesc = partition.toSeq.map(entry => entry._1 + "=" + entry._2).mkString(",") + + val partitionName = if (partition.isEmpty) "" else s" in partition ($partitionDesc)" + val description = s"$baseTablePath
Optimizing ${bin.size} files" + partitionName + sparkSession.sparkContext.setJobGroup( + sparkSession.sparkContext.getLocalProperty(SPARK_JOB_GROUP_ID), + description) + + val binInfo = optimizeStrategy.initNewBin + val addFiles = txn.writeFiles(repartitionDF, None, isOptimize = true, Nil).collect { + case a: AddFile => optimizeStrategy.tagAddFile(a, binInfo) + case other => + throw new IllegalStateException( + s"Unexpected action $other with type ${other.getClass}. File compaction job output" + + s"should only have AddFiles") + } + val removeFiles = bin.map(f => f.removeWithTimestamp(operationTimestamp, dataChange = false)) + val updates = addFiles ++ removeFiles + updates + } + + /** + * Attempts to commit the given actions to the log. In the case of a concurrent update, + * the given function will be invoked with a new transaction to allow custom conflict + * detection logic to indicate it is safe to try again, by returning `true`. + * + * This function will continue to try to commit to the log as long as `f` returns `true`, + * otherwise throws a subclass of [[ConcurrentModificationException]]. + */ + private def commitAndRetry( + txn: OptimisticTransaction, + optimizeOperation: Operation, + actions: Seq[Action], + metrics: Map[String, SQLMetric])(f: OptimisticTransaction => Boolean): Unit = { + try { + txn.registerSQLMetrics(sparkSession, metrics) + txn.commit(actions, optimizeOperation, + RowTracking.addPreservedRowTrackingTagIfNotSet(txn.snapshot)) + } catch { + case e: ConcurrentModificationException => + val newTxn = txn.deltaLog.startTransaction(txn.catalogTable) + if (f(newTxn)) { + logInfo("Retrying commit after checking for semantic conflicts with concurrent updates.") + commitAndRetry(newTxn, optimizeOperation, actions, metrics)(f) + } else { + logWarning("Semantic conflicts detected. Aborting operation.") + throw e + } + } + } + + /** Create the appropriate [[Operation]] object for txn commit history */ + private def getOperation(): Operation = { + if (optimizeContext.reorg.nonEmpty) { + DeltaOperations.Reorg(partitionPredicate) + } else { + DeltaOperations.Optimize( + predicate = partitionPredicate, + zOrderBy = zOrderByColumns, + auto = isAutoCompact, + clusterBy = if (isClusteredTable) Option(clusteringColumns).filter(_.nonEmpty) else None) + } + } + + /** Create a map of SQL metrics for adding to the commit history. 
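commitAndRetry above loops on ConcurrentModificationException: it opens a fresh transaction, lets the caller-supplied check decide whether the compaction is still valid against the commits that have since become visible, and either retries or rethrows. The generic sketch below mirrors (not reproduces) that shape; start, tryCommit and stillSafe are stand-ins for the Delta-specific pieces.

// --- illustrative sketch start (not part of the patch)
import java.util.ConcurrentModificationException

object RetrySketch {
  def commitWithRetry[Txn](start: () => Txn)(tryCommit: Txn => Unit)(stillSafe: Txn => Boolean): Unit = {
    def loop(txn: Txn): Unit =
      try tryCommit(txn)
      catch {
        case e: ConcurrentModificationException =>
          val fresh = start()            // a new transaction sees the concurrent writer's commit
          if (stillSafe(fresh)) loop(fresh) else throw e
      }
    loop(start())
  }
}
// --- illustrative sketch end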
*/ + private def createMetrics( + sparkContext: SparkContext, + addedFiles: Seq[AddFile], + removedFiles: Seq[RemoveFile], + removedDVs: Seq[DeletionVectorDescriptor]): Map[String, SQLMetric] = { + + def setAndReturnMetric(description: String, value: Long) = { + val metric = createMetric(sparkContext, description) + metric.set(value) + metric + } + + def totalSize(actions: Seq[FileAction]): Long = { + var totalSize = 0L + actions.foreach { file => + val fileSize = file match { + case addFile: AddFile => addFile.size + case removeFile: RemoveFile => removeFile.size.getOrElse(0L) + case default => + throw new IllegalArgumentException(s"Unknown FileAction type: ${default.getClass}") + } + totalSize += fileSize + } + totalSize + } + + val (deletionVectorRowsRemoved, deletionVectorBytesRemoved) = + removedDVs.map(dv => (dv.cardinality, dv.sizeInBytes.toLong)) + .reduceLeftOption((dv1, dv2) => (dv1._1 + dv2._1, dv1._2 + dv2._2)) + .getOrElse((0L, 0L)) + + val dvMetrics: Map[String, SQLMetric] = Map( + "numDeletionVectorsRemoved" -> + setAndReturnMetric( + "total number of deletion vectors removed", + removedDVs.size), + "numDeletionVectorRowsRemoved" -> + setAndReturnMetric( + "total number of deletion vector rows removed", + deletionVectorRowsRemoved), + "numDeletionVectorBytesRemoved" -> + setAndReturnMetric( + "total number of bytes of removed deletion vectors", + deletionVectorBytesRemoved)) + + val sizeStats = FileSizeStatsWithHistogram.create(addedFiles.map(_.size).sorted) + Map[String, SQLMetric]( + "minFileSize" -> setAndReturnMetric("minimum file size", sizeStats.get.min), + "p25FileSize" -> setAndReturnMetric("25th percentile file size", sizeStats.get.p25), + "p50FileSize" -> setAndReturnMetric("50th percentile file size", sizeStats.get.p50), + "p75FileSize" -> setAndReturnMetric("75th percentile file size", sizeStats.get.p75), + "maxFileSize" -> setAndReturnMetric("maximum file size", sizeStats.get.max), + "numAddedFiles" -> setAndReturnMetric("total number of files added.", addedFiles.size), + "numRemovedFiles" -> setAndReturnMetric("total number of files removed.", removedFiles.size), + "numAddedBytes" -> setAndReturnMetric("total number of bytes added", totalSize(addedFiles)), + "numRemovedBytes" -> + setAndReturnMetric("total number of bytes removed", totalSize(removedFiles)) + ) ++ dvMetrics + } +} diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/OptimizeTableCommandOverwrites.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/OptimizeTableCommandOverwrites.scala new file mode 100644 index 0000000000000..7b4c3231b8c31 --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/OptimizeTableCommandOverwrites.scala @@ -0,0 +1,345 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.delta.commands + +import org.apache.gluten.expression.ConverterUtils + +import org.apache.spark.{TaskContext, TaskOutputFileAlreadyExistException} +import org.apache.spark.internal.Logging +import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage +import org.apache.spark.internal.io.SparkHadoopWriterUtils +import org.apache.spark.shuffle.FetchFailedException +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} +import org.apache.spark.sql.catalyst.catalog.CatalogTableType +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.types.DataTypeUtils +import org.apache.spark.sql.delta._ +import org.apache.spark.sql.delta.actions.{AddFile, FileAction} +import org.apache.spark.sql.delta.catalog.{ClickHouseTableV2, DeltaTableV2} +import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.execution.datasources.CHDatasourceJniWrapper +import org.apache.spark.sql.execution.datasources.v1.CHMergeTreeWriterInjects +import org.apache.spark.sql.execution.datasources.v1.clickhouse._ +import org.apache.spark.sql.execution.datasources.v2.clickhouse.metadata.{AddFileTags, AddMergeTreeParts} +import org.apache.spark.sql.execution.datasources.v2.clickhouse.utils.CHDataSourceUtils +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.{SerializableConfiguration, SystemClock, Utils} + +import org.apache.hadoop.fs.{FileAlreadyExistsException, Path} +import org.apache.hadoop.mapreduce.{TaskAttemptContext, TaskAttemptID, TaskID, TaskType} +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl + +import java.util.{Date, UUID} + +import scala.collection.mutable.ArrayBuffer + +object OptimizeTableCommandOverwrites extends Logging { + + case class TaskDescription( + path: String, + database: String, + tableName: String, + snapshotId: String, + orderByKeyOption: Option[Seq[String]], + lowCardKeyOption: Option[Seq[String]], + minmaxIndexKeyOption: Option[Seq[String]], + bfIndexKeyOption: Option[Seq[String]], + setIndexKeyOption: Option[Seq[String]], + primaryKeyOption: Option[Seq[String]], + partitionColumns: Seq[String], + partList: Seq[String], + tableSchema: StructType, + clickhouseTableConfigs: Map[String, String], + serializableHadoopConf: SerializableConfiguration, + jobIdInstant: Long, + partitionDir: Option[String], + bucketDir: Option[String] + ) + + private def executeTask( + description: TaskDescription, + sparkStageId: Int, + sparkPartitionId: Int, + sparkAttemptNumber: Int + ): MergeTreeWriteTaskResult = { + + val jobId = SparkHadoopWriterUtils.createJobID(new Date(description.jobIdInstant), sparkStageId) + val taskId = new TaskID(jobId, TaskType.MAP, sparkPartitionId) + val taskAttemptId = new TaskAttemptID(taskId, sparkAttemptNumber) + + // Set up the attempt context required to use in the output committer. 
+ val taskAttemptContext: TaskAttemptContext = { + // Set up the configuration object + val hadoopConf = description.serializableHadoopConf.value + hadoopConf.set("mapreduce.job.id", jobId.toString) + hadoopConf.set("mapreduce.task.id", taskAttemptId.getTaskID.toString) + hadoopConf.set("mapreduce.task.attempt.id", taskAttemptId.toString) + hadoopConf.setBoolean("mapreduce.task.ismap", true) + hadoopConf.setInt("mapreduce.task.partition", 0) + + new TaskAttemptContextImpl(hadoopConf, taskAttemptId) + } + + try { + Utils.tryWithSafeFinallyAndFailureCallbacks(block = { + + val uuid = UUID.randomUUID.toString + + val planWithSplitInfo = CHMergeTreeWriterInjects.genMergeTreeWriteRel( + description.path, + description.database, + description.tableName, + description.snapshotId, + description.orderByKeyOption, + description.lowCardKeyOption, + description.minmaxIndexKeyOption, + description.bfIndexKeyOption, + description.setIndexKeyOption, + description.primaryKeyOption, + description.partitionColumns, + description.partList, + ConverterUtils.convertNamedStructJson(description.tableSchema), + description.clickhouseTableConfigs, + DataTypeUtils.toAttributes(description.tableSchema) + ) + + val datasourceJniWrapper = new CHDatasourceJniWrapper() + val returnedMetrics = + datasourceJniWrapper.nativeMergeMTParts( + planWithSplitInfo.plan, + planWithSplitInfo.splitInfo, + uuid, + taskId.getId.toString, + description.partitionDir.getOrElse(""), + description.bucketDir.getOrElse("") + ) + if (returnedMetrics != null && returnedMetrics.nonEmpty) { + val addFiles = AddFileTags.partsMetricsToAddFile( + description.database, + description.tableName, + description.path, + returnedMetrics, + Seq(Utils.localHostName())) + + val (taskCommitMessage, taskCommitTime) = Utils.timeTakenMs { + // committer.commitTask(taskAttemptContext) + new TaskCommitMessage(addFiles.toSeq) + } + +// val summary = MergeTreeExecutedWriteSummary( +// updatedPartitions = updatedPartitions.toSet, +// stats = statsTrackers.map(_.getFinalStats(taskCommitTime))) + MergeTreeWriteTaskResult(taskCommitMessage, null) + } else { + throw new IllegalStateException() + } + })( + catchBlock = { + // If there is an error, abort the task + logError(s"Job $jobId aborted.") + }, + finallyBlock = {}) + } catch { + case e: FetchFailedException => + throw e + case f: FileAlreadyExistsException if SQLConf.get.fastFailFileFormatOutput => + // If any output file to write already exists, it does not make sense to re-run this task. + // We throw the exception and let Executor throw ExceptionFailure to abort the job. 
+ throw new TaskOutputFileAlreadyExistException(f) + case t: Throwable => + throw QueryExecutionErrors.taskFailedWhileWritingRowsError(description.path, t) + } + + } + + def runOptimizeBinJobClickhouse( + txn: OptimisticTransaction, + partitionValues: Map[String, String], + bucketNum: String, + bin: Seq[AddFile], + maxFileSize: Long): Seq[FileAction] = { + val tableV2 = ClickHouseTableV2.getTable(txn.deltaLog); + + val sparkSession = SparkSession.getActiveSession.get + + val rddWithNonEmptyPartitions = + sparkSession.sparkContext.parallelize(Array.empty[InternalRow], 1) + + val jobIdInstant = new Date().getTime + val ret = new Array[MergeTreeWriteTaskResult](rddWithNonEmptyPartitions.partitions.length) + + val serializableHadoopConf = new SerializableConfiguration( + sparkSession.sessionState.newHadoopConfWithOptions( + txn.metadata.configuration ++ txn.deltaLog.options)) + + val partitionDir = if (tableV2.partitionColumns.isEmpty) { + None + } else { + Some(tableV2.partitionColumns.map(c => c + "=" + partitionValues(c)).mkString("/")) + } + + val bucketDir = if (tableV2.bucketOption.isEmpty) { + None + } else { + Some(bucketNum) + } + + val description = TaskDescription.apply( + txn.deltaLog.dataPath.toString, + tableV2.dataBaseName, + tableV2.tableName, + ClickhouseSnapshot.genSnapshotId(tableV2.initialSnapshot), + tableV2.orderByKeyOption, + tableV2.lowCardKeyOption, + tableV2.minmaxIndexKeyOption, + tableV2.bfIndexKeyOption, + tableV2.setIndexKeyOption, + tableV2.primaryKeyOption, + tableV2.partitionColumns, + bin.map(_.asInstanceOf[AddMergeTreeParts].name), + tableV2.schema(), + tableV2.clickhouseTableConfigs, + serializableHadoopConf, + jobIdInstant, + partitionDir, + bucketDir + ) + sparkSession.sparkContext.runJob( + rddWithNonEmptyPartitions, + (taskContext: TaskContext, _: Iterator[InternalRow]) => { + executeTask( + description, + taskContext.stageId(), + taskContext.partitionId(), + taskContext.taskAttemptId().toInt & Integer.MAX_VALUE + ) + }, + rddWithNonEmptyPartitions.partitions.indices, + (index, res: MergeTreeWriteTaskResult) => { + ret(index) = res + } + ) + + val addFiles = ret + .flatMap(_.commitMsg.obj.asInstanceOf[Seq[AddFile]]) + .toSeq + + val removeFiles = + bin.map(f => f.removeWithTimestamp(new SystemClock().getTimeMillis(), dataChange = false)) + addFiles ++ removeFiles + + } + + def getDeltaLogClickhouse( + spark: SparkSession, + path: Option[String], + tableIdentifier: Option[TableIdentifier], + operationName: String, + hadoopConf: Map[String, String] = Map.empty): DeltaLog = { + val tablePath = + if (path.nonEmpty) { + new Path(path.get) + } else if (tableIdentifier.nonEmpty) { + val sessionCatalog = spark.sessionState.catalog + lazy val metadata = sessionCatalog.getTableMetadata(tableIdentifier.get) + + if (CHDataSourceUtils.isClickhousePath(spark, tableIdentifier.get)) { + new Path(tableIdentifier.get.table) + } else if (CHDataSourceUtils.isClickHouseTable(spark, tableIdentifier.get)) { + new Path(metadata.location) + } else { + DeltaTableIdentifier(spark, tableIdentifier.get) match { + case Some(id) if id.path.nonEmpty => + new Path(id.path.get) + case Some(id) if id.table.nonEmpty => + new Path(metadata.location) + case _ => + if (metadata.tableType == CatalogTableType.VIEW) { + throw DeltaErrors.viewNotSupported(operationName) + } + throw DeltaErrors.notADeltaTableException(operationName) + } + } + } else { + throw DeltaErrors.missingTableIdentifierException(operationName) + } + + val startTime = Some(System.currentTimeMillis) + val deltaLog = 
DeltaLog.forTable(spark, tablePath, hadoopConf) + if (deltaLog.update(checkIfUpdatedSinceTs = startTime).version < 0) { + throw DeltaErrors.notADeltaTableException( + operationName, + DeltaTableIdentifier(path, tableIdentifier)) + } + deltaLog + } + + /** + * Extracts the [[DeltaTableV2]] from a LogicalPlan iff the LogicalPlan is a [[ResolvedTable]] + * with either a [[DeltaTableV2]] or a [[V1Table]] that is referencing a Delta table. In all + * other cases this method will throw a "Table not found" exception. + */ + def getDeltaTable(target: LogicalPlan, cmd: String): DeltaTableV2 = { + // TODO: Remove this wrapper and let former callers invoke DeltaTableV2.extractFrom directly. + ClickHouseTableV2.extractFrom(target, cmd) + } + + def groupFilesIntoBinsClickhouse( + partitionsToCompact: Seq[((String, Map[String, String]), Seq[AddFile])], + maxTargetFileSize: Long): Seq[((String, Map[String, String]), Seq[AddFile])] = { + partitionsToCompact.flatMap { + case (partition, files) => + val bins = new ArrayBuffer[Seq[AddFile]]() + + val currentBin = new ArrayBuffer[AddFile]() + var currentBinSize = 0L + + files.sortBy(_.size).foreach { + file => + // Generally, a bin is a group of existing files, whose total size does not exceed the + // desired maxFileSize. They will be coalesced into a single output file. + // However, if isMultiDimClustering = true, all files in a partition will be read by the + // same job, the data will be range-partitioned and + // numFiles = totalFileSize / maxFileSize + // will be produced. See below. + + // isMultiDimClustering is always false for Gluten Clickhouse for now + if (file.size + currentBinSize > maxTargetFileSize /* && !isMultiDimClustering */ ) { + bins += currentBin.toVector + currentBin.clear() + currentBin += file + currentBinSize = file.size + } else { + currentBin += file + currentBinSize += file.size + } + } + + if (currentBin.nonEmpty) { + bins += currentBin.toVector + } + + bins + .map(b => (partition, b)) + // select bins that have at least two files or in case of multi-dim clustering + // select all bins + .filter(_._2.size > 1 /* || isMultiDimClustering */ ) + } + } +} diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/UpdateCommand.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/UpdateCommand.scala new file mode 100644 index 0000000000000..9a7fb96775f09 --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/UpdateCommand.scala @@ -0,0 +1,556 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.delta.commands + +// scalastyle:off import.ordering.noEmptyLine +import java.util.concurrent.TimeUnit + +import org.apache.spark.sql.delta.metric.IncrementMetric +import org.apache.spark.sql.delta._ +import org.apache.spark.sql.delta.actions.{AddCDCFile, AddFile, FileAction} +import org.apache.spark.sql.delta.commands.cdc.CDCReader.{CDC_TYPE_COLUMN_NAME, CDC_TYPE_NOT_CDC, CDC_TYPE_UPDATE_POSTIMAGE, CDC_TYPE_UPDATE_PREIMAGE} +import org.apache.spark.sql.delta.files.{TahoeBatchFileIndex, TahoeFileIndex} +import org.apache.spark.sql.delta.sources.DeltaSQLConf +import org.apache.hadoop.fs.Path + +import org.apache.spark.SparkContext +import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession} +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, If, Literal} +import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.command.LeafRunnableCommand +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.metric.SQLMetrics.{createMetric, createTimingMetric} +import org.apache.spark.sql.functions.{array, col, explode, input_file_name, lit, split, struct} +import org.apache.spark.sql.types.LongType + +/** + * Gluten overwrite Delta: + * + * This file is copied from Delta 3.2.0. It is modified to overcome the following issues: + * 1. In the Clickhouse backend, we can't implement input_file_name() correctly; we can only implement + * it so that it returns a list of filenames (concatenated by ','). + */ + +/** + * Performs an Update using `updateExpression` on the rows that match `condition` + * + * Algorithm: + * 1) Identify the affected files, i.e., the files that may have the rows to be updated. + * 2) Scan affected files, apply the updates, and generate a new DF with updated rows. + * 3) Use the Delta protocol to atomically write the new DF as new files and remove + * the affected files that are identified in step 1.
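+ *
+ * (Editor's note — illustrative sketch, not from the original Delta/Gluten sources.) The
+ * ClickHouse-specific change mentioned above surfaces in step 1: input_file_name() may return
+ * a comma-separated list of part files rather than a single path, so touched-file discovery
+ * splits and explodes it into individual paths. `data` and `updateCondition` stand for locals
+ * of performUpdate and are assumed here, with the delta implicits in scope for `.as[String]`:
+ * {{{
+ * val touchedPaths = data
+ *   .filter(new Column(updateCondition))
+ *   .select(input_file_name().as("input_files"))
+ *   .select(explode(split(col("input_files"), ",")))
+ *   .distinct()
+ *   .as[String]
+ *   .collect()
+ * }}}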
+ */ +case class UpdateCommand( + tahoeFileIndex: TahoeFileIndex, + catalogTable: Option[CatalogTable], + target: LogicalPlan, + updateExpressions: Seq[Expression], + condition: Option[Expression]) + extends LeafRunnableCommand with DeltaCommand { + + override val output: Seq[Attribute] = { + Seq(AttributeReference("num_affected_rows", LongType)()) + } + + override def innerChildren: Seq[QueryPlan[_]] = Seq(target) + + @transient private lazy val sc: SparkContext = SparkContext.getOrCreate() + + override lazy val metrics = Map[String, SQLMetric]( + "numAddedFiles" -> createMetric(sc, "number of files added."), + "numAddedBytes" -> createMetric(sc, "number of bytes added"), + "numRemovedFiles" -> createMetric(sc, "number of files removed."), + "numRemovedBytes" -> createMetric(sc, "number of bytes removed"), + "numUpdatedRows" -> createMetric(sc, "number of rows updated."), + "numCopiedRows" -> createMetric(sc, "number of rows copied."), + "executionTimeMs" -> + createTimingMetric(sc, "time taken to execute the entire operation"), + "scanTimeMs" -> + createTimingMetric(sc, "time taken to scan the files for matches"), + "rewriteTimeMs" -> + createTimingMetric(sc, "time taken to rewrite the matched files"), + "numAddedChangeFiles" -> createMetric(sc, "number of change data capture files generated"), + "changeFileBytes" -> createMetric(sc, "total size of change data capture files generated"), + "numTouchedRows" -> createMetric(sc, "number of rows touched (copied + updated)"), + "numDeletionVectorsAdded" -> createMetric(sc, "number of deletion vectors added"), + "numDeletionVectorsRemoved" -> createMetric(sc, "number of deletion vectors removed"), + "numDeletionVectorsUpdated" -> createMetric(sc, "number of deletion vectors updated") + ) + + final override def run(sparkSession: SparkSession): Seq[Row] = { + recordDeltaOperation(tahoeFileIndex.deltaLog, "delta.dml.update") { + val deltaLog = tahoeFileIndex.deltaLog + deltaLog.withNewTransaction(catalogTable) { txn => + DeltaLog.assertRemovable(txn.snapshot) + if (hasBeenExecuted(txn, sparkSession)) { + sendDriverMetrics(sparkSession, metrics) + return Seq.empty + } + performUpdate(sparkSession, deltaLog, txn) + } + // Re-cache all cached plans(including this relation itself, if it's cached) that refer to + // this data source relation. + sparkSession.sharedState.cacheManager.recacheByPlan(sparkSession, target) + } + Seq(Row(metrics("numUpdatedRows").value)) + } + + private def performUpdate( + sparkSession: SparkSession, deltaLog: DeltaLog, txn: OptimisticTransaction): Unit = { + import org.apache.spark.sql.delta.implicits._ + + var numTouchedFiles: Long = 0 + var numRewrittenFiles: Long = 0 + var numAddedBytes: Long = 0 + var numRemovedBytes: Long = 0 + var numAddedChangeFiles: Long = 0 + var changeFileBytes: Long = 0 + var scanTimeMs: Long = 0 + var rewriteTimeMs: Long = 0 + var numDeletionVectorsAdded: Long = 0 + var numDeletionVectorsRemoved: Long = 0 + var numDeletionVectorsUpdated: Long = 0 + + val startTime = System.nanoTime() + val numFilesTotal = txn.snapshot.numOfFiles + + val updateCondition = condition.getOrElse(Literal.TrueLiteral) + val (metadataPredicates, dataPredicates) = + DeltaTableUtils.splitMetadataAndDataPredicates( + updateCondition, txn.metadata.partitionColumns, sparkSession) + + // Should we write the DVs to represent updated rows? 
+ val shouldWriteDeletionVectors = shouldWritePersistentDeletionVectors(sparkSession, txn) + val candidateFiles = txn.filterFiles( + metadataPredicates ++ dataPredicates, + keepNumRecords = shouldWriteDeletionVectors) + + val nameToAddFile = generateCandidateFileMap(deltaLog.dataPath, candidateFiles) + + scanTimeMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTime) + + val filesToRewrite: Seq[TouchedFileWithDV] = if (candidateFiles.isEmpty) { + // Case 1: Do nothing if no row qualifies the partition predicates + // that are part of Update condition + Nil + } else if (dataPredicates.isEmpty) { + // Case 2: Update all the rows from the files that are in the specified partitions + // when the data filter is empty + candidateFiles + .map(f => TouchedFileWithDV(f.path, f, newDeletionVector = null, deletedRows = 0L)) + } else { + // Case 3: Find all the affected files using the user-specified condition + val fileIndex = new TahoeBatchFileIndex( + sparkSession, "update", candidateFiles, deltaLog, tahoeFileIndex.path, txn.snapshot) + + val touchedFilesWithDV = if (shouldWriteDeletionVectors) { + // Case 3.1: Find all the affected files via DV path + val targetDf = DMLWithDeletionVectorsHelper.createTargetDfForScanningForMatches( + sparkSession, + target, + fileIndex) + + // Does the target table already has DVs enabled? If so, we need to read the table + // with deletion vectors. + val mustReadDeletionVectors = DeletionVectorUtils.deletionVectorsReadable(txn.snapshot) + + DMLWithDeletionVectorsHelper.findTouchedFiles( + sparkSession, + txn, + mustReadDeletionVectors, + deltaLog, + targetDf, + fileIndex, + updateCondition, + opName = "UPDATE") + } else { + // Case 3.2: Find all the affected files using the non-DV path + // Keep everything from the resolved target except a new TahoeFileIndex + // that only involves the affected files instead of all files. + val newTarget = DeltaTableUtils.replaceFileIndex(target, fileIndex) + val data = Dataset.ofRows(sparkSession, newTarget) + val incrUpdatedCountExpr = IncrementMetric(TrueLiteral, metrics("numUpdatedRows")) + val pathsToRewrite = + withStatusCode("DELTA", UpdateCommand.FINDING_TOUCHED_FILES_MSG) { + // --- modified start + data.filter(new Column(updateCondition)) + .select(input_file_name().as("input_files")) + .filter(new Column(incrUpdatedCountExpr)) + .select(explode(split(col("input_files"), ","))) + .distinct() + .as[String] + .collect() + // --- modified end + } + + // Wrap AddFile into TouchedFileWithDV that has empty DV. + pathsToRewrite + .map(getTouchedFile(deltaLog.dataPath, _, nameToAddFile)) + .map(f => TouchedFileWithDV(f.path, f, newDeletionVector = null, deletedRows = 0L)) + .toSeq + } + // Refresh scan time for Case 3, since we performed scan here. + scanTimeMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTime) + touchedFilesWithDV + } + + val totalActions = { + // When DV is on, we first mask removed rows with DVs and generate (remove, add) pairs. + val actionsForExistingFiles = if (shouldWriteDeletionVectors) { + // When there's no data predicate, all matched files are removed. + if (dataPredicates.isEmpty) { + val operationTimestamp = System.currentTimeMillis() + filesToRewrite.map(_.fileLogEntry.removeWithTimestamp(operationTimestamp)) + } else { + // When there is data predicate, we generate (remove, add) pairs. 
+ val filesToRewriteWithDV = filesToRewrite.filter(_.newDeletionVector != null) + val (dvActions, metricMap) = DMLWithDeletionVectorsHelper.processUnmodifiedData( + sparkSession, + filesToRewriteWithDV, + txn.snapshot) + metrics("numUpdatedRows").set(metricMap("numModifiedRows")) + numDeletionVectorsAdded = metricMap("numDeletionVectorsAdded") + numDeletionVectorsRemoved = metricMap("numDeletionVectorsRemoved") + numDeletionVectorsUpdated = metricMap("numDeletionVectorsUpdated") + numTouchedFiles = metricMap("numRemovedFiles") + dvActions + } + } else { + // Without DV we'll leave the job to `rewriteFiles`. + Nil + } + + // When DV is on, we write out updated rows only. The return value will be only `add` actions. + // When DV is off, we write out updated rows plus unmodified rows from the same file, then + // return `add` and `remove` actions. + val rewriteStartNs = System.nanoTime() + val actionsForNewFiles = + withStatusCode("DELTA", UpdateCommand.rewritingFilesMsg(filesToRewrite.size)) { + if (filesToRewrite.nonEmpty) { + rewriteFiles( + sparkSession, + txn, + rootPath = tahoeFileIndex.path, + inputLeafFiles = filesToRewrite.map(_.fileLogEntry), + nameToAddFileMap = nameToAddFile, + condition = updateCondition, + generateRemoveFileActions = !shouldWriteDeletionVectors, + copyUnmodifiedRows = !shouldWriteDeletionVectors) + } else { + Nil + } + } + rewriteTimeMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - rewriteStartNs) + + numTouchedFiles = filesToRewrite.length + val (addActions, removeActions) = actionsForNewFiles.partition(_.isInstanceOf[AddFile]) + numRewrittenFiles = addActions.size + numAddedBytes = addActions.map(_.getFileSize).sum + numRemovedBytes = removeActions.map(_.getFileSize).sum + + actionsForExistingFiles ++ actionsForNewFiles + } + + val changeActions = totalActions.collect { case f: AddCDCFile => f } + numAddedChangeFiles = changeActions.size + changeFileBytes = changeActions.map(_.size).sum + + metrics("numAddedFiles").set(numRewrittenFiles) + metrics("numAddedBytes").set(numAddedBytes) + metrics("numAddedChangeFiles").set(numAddedChangeFiles) + metrics("changeFileBytes").set(changeFileBytes) + metrics("numRemovedFiles").set(numTouchedFiles) + metrics("numRemovedBytes").set(numRemovedBytes) + metrics("executionTimeMs").set(TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTime)) + metrics("scanTimeMs").set(scanTimeMs) + metrics("rewriteTimeMs").set(rewriteTimeMs) + // In the case where the numUpdatedRows is not captured, we can siphon out the metrics from + // the BasicWriteStatsTracker. This is for case 2 where the update condition contains only + // metadata predicates and so the entire partition is re-written. + val outputRows = txn.getMetric("numOutputRows").map(_.value).getOrElse(-1L) + if (metrics("numUpdatedRows").value == 0 && outputRows != 0 && + metrics("numCopiedRows").value == 0) { + // We know that numTouchedRows = numCopiedRows + numUpdatedRows. + // Since an entire partition was re-written, no rows were copied. + // So numTouchedRows == numUpdateRows + metrics("numUpdatedRows").set(metrics("numTouchedRows").value) + } else { + // This is for case 3 where the update condition contains both metadata and data predicates + // so relevant files will have some rows updated and some rows copied. We don't need to + // consider case 1 here, where no files match the update condition, as we know that + // `totalActions` is empty. 
+ metrics("numCopiedRows").set( + metrics("numTouchedRows").value - metrics("numUpdatedRows").value) + metrics("numDeletionVectorsAdded").set(numDeletionVectorsAdded) + metrics("numDeletionVectorsRemoved").set(numDeletionVectorsRemoved) + metrics("numDeletionVectorsUpdated").set(numDeletionVectorsUpdated) + } + txn.registerSQLMetrics(sparkSession, metrics) + + val finalActions = createSetTransaction(sparkSession, deltaLog).toSeq ++ totalActions + txn.commitIfNeeded( + actions = finalActions, + op = DeltaOperations.Update(condition), + tags = RowTracking.addPreservedRowTrackingTagIfNotSet(txn.snapshot)) + sendDriverMetrics(sparkSession, metrics) + + recordDeltaEvent( + deltaLog, + "delta.dml.update.stats", + data = UpdateMetric( + condition = condition.map(_.sql).getOrElse("true"), + numFilesTotal, + numTouchedFiles, + numRewrittenFiles, + numAddedChangeFiles, + changeFileBytes, + scanTimeMs, + rewriteTimeMs, + numDeletionVectorsAdded, + numDeletionVectorsRemoved, + numDeletionVectorsUpdated) + ) + } + + /** + * Scan all the affected files and write out the updated files. + * + * When CDF is enabled, includes the generation of CDC preimage and postimage columns for + * changed rows. + * + * @return a list of [[FileAction]]s, consisting of newly-written data and CDC files and old + * files that have been removed. + */ + private def rewriteFiles( + spark: SparkSession, + txn: OptimisticTransaction, + rootPath: Path, + inputLeafFiles: Seq[AddFile], + nameToAddFileMap: Map[String, AddFile], + condition: Expression, + generateRemoveFileActions: Boolean, + copyUnmodifiedRows: Boolean): Seq[FileAction] = { + // Number of total rows that we have seen, i.e. are either copying or updating (sum of both). + // This will be used later, along with numUpdatedRows, to determine numCopiedRows. 
+ val incrTouchedCountExpr = IncrementMetric(TrueLiteral, metrics("numTouchedRows")) + + // Containing the map from the relative file path to AddFile + val baseRelation = buildBaseRelation( + spark, txn, "update", rootPath, inputLeafFiles.map(_.path), nameToAddFileMap) + val newTarget = DeltaTableUtils.replaceFileIndex(target, baseRelation.location) + val (targetDf, finalOutput, finalUpdateExpressions) = UpdateCommand.preserveRowTrackingColumns( + targetDfWithoutRowTrackingColumns = Dataset.ofRows(spark, newTarget), + snapshot = txn.snapshot, + targetOutput = target.output, + updateExpressions) + + val targetDfWithEvaluatedCondition = { + val evalDf = targetDf.withColumn(UpdateCommand.CONDITION_COLUMN_NAME, new Column(condition)) + val copyAndUpdateRowsDf = if (copyUnmodifiedRows) { + evalDf + } else { + evalDf.filter(new Column(UpdateCommand.CONDITION_COLUMN_NAME)) + } + copyAndUpdateRowsDf.filter(new Column(incrTouchedCountExpr)) + } + + val updatedDataFrame = UpdateCommand.withUpdatedColumns( + finalOutput, + finalUpdateExpressions, + condition, + targetDfWithEvaluatedCondition, + UpdateCommand.shouldOutputCdc(txn)) + + val addFiles = txn.writeFiles(updatedDataFrame) + + val removeFiles = if (generateRemoveFileActions) { + val operationTimestamp = System.currentTimeMillis() + inputLeafFiles.map(_.removeWithTimestamp(operationTimestamp)) + } else { + Nil + } + + addFiles ++ removeFiles + } + + def shouldWritePersistentDeletionVectors( + spark: SparkSession, txn: OptimisticTransaction): Boolean = { + spark.conf.get(DeltaSQLConf.UPDATE_USE_PERSISTENT_DELETION_VECTORS) && + DeletionVectorUtils.deletionVectorsWritable(txn.snapshot) + } +} + +object UpdateCommand { + val FILE_NAME_COLUMN = "_input_file_name_" + val CONDITION_COLUMN_NAME = "__condition__" + val FINDING_TOUCHED_FILES_MSG: String = "Finding files to rewrite for UPDATE operation" + + def rewritingFilesMsg(numFilesToRewrite: Long): String = + s"Rewriting $numFilesToRewrite files for UPDATE operation" + + /** + * Whether or not CDC is enabled on this table and, thus, if we should output CDC data during this + * UPDATE operation. + */ + def shouldOutputCdc(txn: OptimisticTransaction): Boolean = { + DeltaConfigs.CHANGE_DATA_FEED.fromMetaData(txn.metadata) + } + + /** + * Build the new columns. If the condition matches, generate the new value using + * the corresponding UPDATE EXPRESSION; otherwise, keep the original column value. + * + * When CDC is enabled, includes the generation of CDC pre-image and post-image columns for + * changed rows. + * + * @param originalExpressions the original column values + * @param updateExpressions the update transformation to perform on the input DataFrame + * @param dfWithEvaluatedCondition source DataFrame on which we will apply the update expressions + * with an additional column CONDITION_COLUMN_NAME which is the + * true/false value of if the update condition is satisfied + * @param condition update condition + * @param shouldOutputCdc if we should output CDC data during this UPDATE operation. 
+ * @return the updated DataFrame, with extra CDC columns if CDC is enabled + */ + def withUpdatedColumns( + originalExpressions: Seq[Attribute], + updateExpressions: Seq[Expression], + condition: Expression, + dfWithEvaluatedCondition: DataFrame, + shouldOutputCdc: Boolean): DataFrame = { + val resultDf = if (shouldOutputCdc) { + val namedUpdateCols = updateExpressions.zip(originalExpressions).map { + case (expr, targetCol) => new Column(expr).as(targetCol.name, targetCol.metadata) + } + + // Build an array of output rows to be unpacked later. If the condition is matched, we + // generate CDC pre and postimages in addition to the final output row; if the condition + // isn't matched, we just generate a rewritten no-op row without any CDC events. + val preimageCols = originalExpressions.map(new Column(_)) :+ + lit(CDC_TYPE_UPDATE_PREIMAGE).as(CDC_TYPE_COLUMN_NAME) + val postimageCols = namedUpdateCols :+ + lit(CDC_TYPE_UPDATE_POSTIMAGE).as(CDC_TYPE_COLUMN_NAME) + val notCdcCol = new Column(CDC_TYPE_NOT_CDC).as(CDC_TYPE_COLUMN_NAME) + val updatedDataCols = namedUpdateCols :+ notCdcCol + val noopRewriteCols = originalExpressions.map(new Column(_)) :+ notCdcCol + val packedUpdates = array( + struct(preimageCols: _*), + struct(postimageCols: _*), + struct(updatedDataCols: _*) + ).expr + + val packedData = if (condition == Literal.TrueLiteral) { + packedUpdates + } else { + If( + UnresolvedAttribute(CONDITION_COLUMN_NAME), + packedUpdates, // if it should be updated, then use `packagedUpdates` + array(struct(noopRewriteCols: _*)).expr) // else, this is a noop rewrite + } + + // Explode the packed array, and project back out the final data columns. + val finalColumns = (originalExpressions :+ UnresolvedAttribute(CDC_TYPE_COLUMN_NAME)).map { + a => col(s"packedData.`${a.name}`").as(a.name, a.metadata) + } + dfWithEvaluatedCondition + .select(explode(new Column(packedData)).as("packedData")) + .select(finalColumns: _*) + } else { + val finalCols = updateExpressions.zip(originalExpressions).map { case (update, original) => + val updated = if (condition == Literal.TrueLiteral) { + update + } else { + If(UnresolvedAttribute(CONDITION_COLUMN_NAME), update, original) + } + new Column(updated).as(original.name, original.metadata) + } + + dfWithEvaluatedCondition.select(finalCols: _*) + } + + resultDf.drop(CONDITION_COLUMN_NAME) + } + + /** + * Preserve the row tracking columns when performing an UPDATE. + * + * @param targetDfWithoutRowTrackingColumns The target DataFrame on which the UPDATE + * operation is to be performed. + * @param snapshot Snapshot of the Delta table at the start of + * the transaction. + * @param targetOutput The output schema of the target DataFrame. + * @param updateExpressions The update transformation to perform on the + * target DataFrame. + * @return + * 1. targetDf: The target DataFrame that includes the preserved row tracking columns. + * 2. finalOutput: The final output schema, including the preserved row tracking columns. + * 3. finalUpdateExpressions: The final update expressions, including transformations + * for the preserved row tracking columns. 
+ */ + def preserveRowTrackingColumns( + targetDfWithoutRowTrackingColumns: DataFrame, + snapshot: Snapshot, + targetOutput: Seq[Attribute] = Seq.empty, + updateExpressions: Seq[Expression] = Seq.empty): + (DataFrame, Seq[Attribute], Seq[Expression]) = { + val targetDf = RowTracking.preserveRowTrackingColumns( + targetDfWithoutRowTrackingColumns, snapshot) + + val rowIdAttributeOpt = MaterializedRowId.getAttribute(snapshot, targetDf) + val rowCommitVersionAttributeOpt = + MaterializedRowCommitVersion.getAttribute(snapshot, targetDf) + val finalOutput = targetOutput ++ rowIdAttributeOpt ++ rowCommitVersionAttributeOpt + + val finalUpdateExpressions = updateExpressions ++ + rowIdAttributeOpt ++ + rowCommitVersionAttributeOpt.map(_ => Literal(null, LongType)) + (targetDf, finalOutput, finalUpdateExpressions) + } +} + +/** + * Used to report details about update. + * + * @param condition: what was the update condition + * @param numFilesTotal: how big is the table + * @param numTouchedFiles: how many files did we touch + * @param numRewrittenFiles: how many files had to be rewritten + * @param numAddedChangeFiles: how many change files were generated + * @param changeFileBytes: total size of change files generated + * @param scanTimeMs: how long did finding take + * @param rewriteTimeMs: how long did rewriting take + * + * @note All the time units are milliseconds. + */ +case class UpdateMetric( + condition: String, + numFilesTotal: Long, + numTouchedFiles: Long, + numRewrittenFiles: Long, + numAddedChangeFiles: Long, + changeFileBytes: Long, + scanTimeMs: Long, + rewriteTimeMs: Long, + numDeletionVectorsAdded: Long, + numDeletionVectorsRemoved: Long, + numDeletionVectorsUpdated: Long +) diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/VacuumCommand.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/VacuumCommand.scala new file mode 100644 index 0000000000000..9f455fb27bb1d --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/VacuumCommand.scala @@ -0,0 +1,735 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.delta.commands + +// scalastyle:off import.ordering.noEmptyLine +import java.net.URI +import java.util.Date +import java.util.concurrent.TimeUnit +import scala.collection.JavaConverters._ +import org.apache.spark.sql.delta._ +import org.apache.spark.sql.delta.actions.{AddFile, FileAction, RemoveFile} +import org.apache.spark.sql.delta.sources.DeltaSQLConf +import org.apache.spark.sql.delta.util.DeltaFileOperations +import org.apache.spark.sql.delta.util.DeltaFileOperations.tryDeleteNonRecursive +import com.fasterxml.jackson.databind.annotation.JsonDeserialize + +import org.apache.gluten.utils.QueryPlanSelector +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.paths.SparkPath +import org.apache.spark.sql.{Column, DataFrame, Dataset, Encoder, SparkSession} +import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{BooleanType, LongType, StringType, StructField, StructType} +import org.apache.spark.util.{Clock, SerializableConfiguration, SystemClock} + +/** + * Gluten overwrite Delta: + * + * This file is copied from Delta 3.2.0. It is modified to overcome the following issues: + * 1. In Gluten, a part is a directory, but VacuumCommand assumes a part is a file. So we need some + * modifications to make it work. + * 2. Set the 'gluten.enabledForCurrentThread' property to false, since Gluten cannot support the vacuum command yet. + */ + +/** + * Vacuums the table by clearing all untracked files and folders within this table. + * First lists all the files and directories in the table, and gets the relative paths with + * respect to the base of the table. Then it gets the list of all tracked files for this table, + * which may or may not be within the table base path, and gets the relative paths of + * all the tracked files with respect to the base of the table. Files outside of the table path + * will be ignored. Then we take a diff of the files and delete directories that were already empty, + * and all files that are within the table that are no longer tracked. + */ +object VacuumCommand extends VacuumCommandImpl with Serializable { + + // --- modified start + case class FileNameAndSize(path: String, length: Long, isDir: Boolean = false) + // --- modified end + + /** + * path : fully qualified uri + * length: size in bytes + * isDir: boolean indicating if it is a directory + * modificationTime: file update time in milliseconds + */ + val INVENTORY_SCHEMA = StructType( + Seq( + StructField("path", StringType), + StructField("length", LongType), + StructField("isDir", BooleanType), + StructField("modificationTime", LongType) + )) + + /** + * Additional check on retention duration to prevent people from shooting themselves in the foot.
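+ *
+ * (Editor's note — illustrative sketch, not from the original Delta/Gluten sources.) In
+ * isolation, the rule enforced below is: a user-specified retention is only safe when it is
+ * at least the configured tombstone retention, and the configured value is reported as a
+ * rounded-up number of hours:
+ * {{{
+ * import java.util.concurrent.TimeUnit
+ *
+ * def isSafe(retentionMs: Option[Long], configuredMs: Long): Boolean =
+ *   retentionMs.forall(_ >= configuredMs)
+ *
+ * def configuredHoursCeil(configuredMs: Long): Long = {
+ *   val h = TimeUnit.MILLISECONDS.toHours(configuredMs)
+ *   if (TimeUnit.HOURS.toMillis(h) < configuredMs) h + 1 else h
+ * }
+ *
+ * isSafe(Some(TimeUnit.HOURS.toMillis(1)), 604800000L)  // false: 1 hour < 7-day default
+ * configuredHoursCeil(604800000L)                       // 168
+ * }}}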
+ */ + protected def checkRetentionPeriodSafety( + spark: SparkSession, + retentionMs: Option[Long], + configuredRetention: Long): Unit = { + require(retentionMs.forall(_ >= 0), "Retention for Vacuum can't be less than 0.") + val checkEnabled = + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_RETENTION_CHECK_ENABLED) + val retentionSafe = retentionMs.forall(_ >= configuredRetention) + var configuredRetentionHours = TimeUnit.MILLISECONDS.toHours(configuredRetention) + if (TimeUnit.HOURS.toMillis(configuredRetentionHours) < configuredRetention) { + configuredRetentionHours += 1 + } + require(!checkEnabled || retentionSafe, + s"""Are you sure you would like to vacuum files with such a low retention period? If you have + |writers that are currently writing to this table, there is a risk that you may corrupt the + |state of your Delta table. + | + |If you are certain that there are no operations being performed on this table, such as + |insert/upsert/delete/optimize, then you may turn off this check by setting: + |spark.databricks.delta.retentionDurationCheck.enabled = false + | + |If you are not sure, please use a value not less than "$configuredRetentionHours hours". + """.stripMargin) + } + + /** + * Helper to compute all valid files based on basePath and Snapshot provided. + */ + private def getValidFilesFromSnapshot( + spark: SparkSession, + basePath: String, + snapshot: Snapshot, + retentionMillis: Option[Long], + hadoopConf: Broadcast[SerializableConfiguration], + clock: Clock, + checkAbsolutePathOnly: Boolean): DataFrame = { + import org.apache.spark.sql.delta.implicits._ + require(snapshot.version >= 0, "No state defined for this table. Is this really " + + "a Delta table? Refusing to garbage collect.") + + val snapshotTombstoneRetentionMillis = DeltaLog.tombstoneRetentionMillis(snapshot.metadata) + checkRetentionPeriodSafety(spark, retentionMillis, snapshotTombstoneRetentionMillis) + val deleteBeforeTimestamp = retentionMillis match { + case Some(millis) => clock.getTimeMillis() - millis + case _ => snapshot.minFileRetentionTimestamp + } + val relativizeIgnoreError = + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_RELATIVIZE_IGNORE_ERROR) + + val canonicalizedBasePath = SparkPath.fromPathString(basePath).urlEncoded + snapshot.stateDS.mapPartitions { actions => + val reservoirBase = new Path(basePath) + val fs = reservoirBase.getFileSystem(hadoopConf.value.value) + actions.flatMap { + _.unwrap match { + // Existing tables may not store canonicalized paths, so we check both the canonicalized + // and non-canonicalized paths to ensure we don't accidentally delete wrong files. 
+ case fa: FileAction if checkAbsolutePathOnly && + !fa.path.contains(basePath) && !fa.path.contains(canonicalizedBasePath) => Nil + case tombstone: RemoveFile if tombstone.delTimestamp < deleteBeforeTimestamp => Nil + case fa: FileAction => + getValidRelativePathsAndSubdirs( + fa, + fs, + reservoirBase, + relativizeIgnoreError + ) + case _ => Nil + } + } + }.toDF("path") + } + + def getFilesFromInventory(basePath: String, + partitionColumns: Seq[String], + inventory: DataFrame): Dataset[SerializableFileStatus] = { + implicit val fileNameAndSizeEncoder: Encoder[SerializableFileStatus] = + org.apache.spark.sql.Encoders.product[SerializableFileStatus] + + // filter out required fields from provided inventory DF + val inventorySchema = StructType( + inventory.schema.fields.filter(f => INVENTORY_SCHEMA.fields.map(_.name).contains(f.name)) + ) + if (inventorySchema != INVENTORY_SCHEMA) { + throw DeltaErrors.invalidInventorySchema(INVENTORY_SCHEMA.treeString) + } + + inventory + .filter(startswith(col("path"), lit(s"$basePath/"))) + .select( + substr(col("path"), lit(basePath.length + 2)).as("path"), + col("length"), col("isDir"), col("modificationTime") + ) + .flatMap { + row => + val path = row.getString(0) + if(!DeltaTableUtils.isHiddenDirectory(partitionColumns, path)) { + Seq(SerializableFileStatus(path, + row.getLong(1), row.getBoolean(2), row.getLong(3))) + } else { + None + } + } + } + + /** + * Clears all untracked files and folders within this table. If the inventory is not provided + * then the command first lists all the files and directories in the table, if inventory is + * provided then it will be used for identifying files and directories within the table and + * gets the relative paths with respect to the base of the table. Then the command gets the + * list of all tracked files for this table, which may or may not be within the table base path, + * and gets the relative paths of all the tracked files with respect to the base of the table. + * Files outside of the table path will be ignored. Then we take a diff of the files and delete + * directories that were already empty, and all files that are within the table that are no longer + * tracked. + * + * @param dryRun If set to true, no files will be deleted. Instead, we will list all files and + * directories that will be cleared. + * @param retentionHours An optional parameter to override the default Delta tombstone retention + * period + * @param inventory An optional dataframe of files and directories within the table generated + * from sources like blob store inventory report + * @return A Dataset containing the paths of the files/folders to delete in dryRun mode. Otherwise + * returns the base path of the table. 
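+ *
+ * (Editor's note — illustrative sketch, not from the original Delta/Gluten sources.) A
+ * dry-run invocation that only lists what would be deleted, using a hypothetical table path
+ * and a 7-day (168 hour) retention:
+ * {{{
+ * val deltaLog = DeltaLog.forTable(spark, "/path/to/table")
+ * val candidates = VacuumCommand.gc(spark, deltaLog, dryRun = true, retentionHours = Some(168.0))
+ * candidates.show(false)  // a single "path" column with the files/directories to be removed
+ * }}}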
+ */ + def gc( + spark: SparkSession, + deltaLog: DeltaLog, + dryRun: Boolean = true, + retentionHours: Option[Double] = None, + inventory: Option[DataFrame] = None, + clock: Clock = new SystemClock): DataFrame = { + recordDeltaOperation(deltaLog, "delta.gc") { + + val vacuumStartTime = System.currentTimeMillis() + val path = deltaLog.dataPath + val deltaHadoopConf = deltaLog.newDeltaHadoopConf() + val fs = path.getFileSystem(deltaHadoopConf) + + import org.apache.spark.sql.delta.implicits._ + + val snapshot = deltaLog.update() + deltaLog.protocolWrite(snapshot.protocol) + + // --- modified start + val isMergeTreeFormat = ClickHouseConfig + .isMergeTreeFormatEngine(deltaLog.unsafeVolatileMetadata.configuration) + // --- modified end + + val snapshotTombstoneRetentionMillis = DeltaLog.tombstoneRetentionMillis(snapshot.metadata) + val retentionMillis = retentionHours.map(h => TimeUnit.HOURS.toMillis(math.round(h))) + val deleteBeforeTimestamp = retentionMillis match { + case Some(millis) => clock.getTimeMillis() - millis + case _ => snapshot.minFileRetentionTimestamp + } + // --- modified start: toGMTString is a deprecated function + logInfo(s"Starting garbage collection (dryRun = $dryRun) of untracked files older than " + + s"${new Date(deleteBeforeTimestamp).toString} in $path") + // --- modified end + val hadoopConf = spark.sparkContext.broadcast( + new SerializableConfiguration(deltaHadoopConf)) + val basePath = fs.makeQualified(path).toString + val parallelDeleteEnabled = + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_PARALLEL_DELETE_ENABLED) + val parallelDeletePartitions = + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_PARALLEL_DELETE_PARALLELISM) + .getOrElse(spark.sessionState.conf.numShufflePartitions) + val startTimeToIdentifyEligibleFiles = System.currentTimeMillis() + + // --- modified start + val originalEnabledGluten = + spark.sparkContext.getLocalProperty(QueryPlanSelector.GLUTEN_ENABLE_FOR_THREAD_KEY) + // gluten can not support vacuum command + spark.sparkContext.setLocalProperty(QueryPlanSelector.GLUTEN_ENABLE_FOR_THREAD_KEY, "false") + // --- modified end + + val validFiles = + getValidFilesFromSnapshot( + spark, + basePath, + snapshot, + retentionMillis, + hadoopConf, + clock, + checkAbsolutePathOnly = false) + + val partitionColumns = snapshot.metadata.partitionSchema.fieldNames + val parallelism = spark.sessionState.conf.parallelPartitionDiscoveryParallelism + val allFilesAndDirsWithDuplicates = inventory match { + case Some(inventoryDF) => getFilesFromInventory(basePath, partitionColumns, inventoryDF) + case None => DeltaFileOperations.recursiveListDirs( + spark, + Seq(basePath), + hadoopConf, + hiddenDirNameFilter = DeltaTableUtils.isHiddenDirectory(partitionColumns, _), + hiddenFileNameFilter = DeltaTableUtils.isHiddenDirectory(partitionColumns, _), + fileListingParallelism = Option(parallelism) + ) + } + val allFilesAndDirs = allFilesAndDirsWithDuplicates.groupByKey(_.path) + .mapGroups { (k, v) => + val duplicates = v.toSeq + // of all the duplicates we can return the newest file. + duplicates.maxBy(_.modificationTime) + } + + recordFrameProfile("Delta", "VacuumCommand.gc") { + try { + allFilesAndDirs.cache() + + implicit val fileNameAndSizeEncoder = + org.apache.spark.sql.Encoders.product[FileNameAndSize] + + val dirCounts = allFilesAndDirs.where(col("isDir")).count() + 1 // +1 for the base path + val filesAndDirsPresentBeforeDelete = allFilesAndDirs.count() + + // The logic below is as follows: + // 1. 
We take all the files and directories listed in our reservoir + // 2. We filter all files older than our tombstone retention period and directories + // 3. We get the subdirectories of all files so that we can find non-empty directories + // 4. We groupBy each path, and count to get how many files are in each sub-directory + // 5. We subtract all the valid files and tombstones in our state + // 6. We filter all paths with a count of 1, which will correspond to files not in the + // state, and empty directories. We can safely delete all of these + // --- modified start + val diff = if (isMergeTreeFormat) { + val diff_tmp = allFilesAndDirs + .where(col("modificationTime") < deleteBeforeTimestamp || col("isDir")) + .mapPartitions { fileStatusIterator => + val reservoirBase = new Path(basePath) + val fs = reservoirBase.getFileSystem(hadoopConf.value.value) + fileStatusIterator.flatMap { fileStatus => + if (fileStatus.isDir) { + Iterator.single(FileNameAndSize( + relativize(fileStatus.getHadoopPath, fs, reservoirBase, isDir = true), + 0L, + true)) + } else { + val dirs = getAllSubdirs(basePath, fileStatus.path, fs) + val dirsWithSlash = dirs.map { p => + val relativizedPath = relativize(new Path(p), fs, reservoirBase, isDir = true) + FileNameAndSize(relativizedPath, 0L, true) + } + dirsWithSlash ++ Iterator( + FileNameAndSize(relativize( + fileStatus.getHadoopPath, fs, reservoirBase, isDir = false), + fileStatus.length)) + } + } + } + .withColumn( + "dir", + when(col("isDir"), col("path")) + .otherwise(expr("substring_index(path, '/',size(split(path, '/')) -1)"))) + .groupBy(col("path"), col("dir")) + .agg(count(new Column("*")).as("count"), sum("length").as("length")) + + diff_tmp + .join(validFiles, diff_tmp("dir") === validFiles("path"), "leftanti") + .where(col("count") === 1) + } else { + allFilesAndDirs + .where(col("modificationTime") < deleteBeforeTimestamp || col("isDir")) + .mapPartitions { fileStatusIterator => + val reservoirBase = new Path(basePath) + val fs = reservoirBase.getFileSystem(hadoopConf.value.value) + fileStatusIterator.flatMap { fileStatus => + if (fileStatus.isDir) { + Iterator.single(FileNameAndSize( + relativize(fileStatus.getHadoopPath, fs, reservoirBase, isDir = true), 0L)) + } else { + val dirs = getAllSubdirs(basePath, fileStatus.path, fs) + val dirsWithSlash = dirs.map { p => + val relativizedPath = relativize(new Path(p), fs, reservoirBase, isDir = true) + FileNameAndSize(relativizedPath, 0L) + } + dirsWithSlash ++ Iterator( + FileNameAndSize(relativize( + fileStatus.getHadoopPath, fs, reservoirBase, isDir = false), + fileStatus.length)) + } + } + }.groupBy(col("path")).agg(count(new Column("*")).as("count"), + sum("length").as("length")) + .join(validFiles, Seq("path"), "leftanti") + .where(col("count") === 1) + } + // --- modified end + + val sizeOfDataToDeleteRow = diff.agg(sum("length").cast("long")).first() + val sizeOfDataToDelete = if (sizeOfDataToDeleteRow.isNullAt(0)) { + 0L + } else { + sizeOfDataToDeleteRow.getLong(0) + } + + val diffFiles = diff + .select(col("path")) + .as[String] + .map { relativePath => + assert(!stringToPath(relativePath).isAbsolute, + "Shouldn't have any absolute paths for deletion here.") + pathToString(DeltaFileOperations.absolutePath(basePath, relativePath)) + } + val timeTakenToIdentifyEligibleFiles = + System.currentTimeMillis() - startTimeToIdentifyEligibleFiles + + + val numFiles = diffFiles.count() + if (dryRun) { + val stats = DeltaVacuumStats( + isDryRun = true, + specifiedRetentionMillis = retentionMillis, + 
defaultRetentionMillis = snapshotTombstoneRetentionMillis, + minRetainedTimestamp = deleteBeforeTimestamp, + dirsPresentBeforeDelete = dirCounts, + filesAndDirsPresentBeforeDelete = filesAndDirsPresentBeforeDelete, + objectsDeleted = numFiles, + sizeOfDataToDelete = sizeOfDataToDelete, + timeTakenToIdentifyEligibleFiles = timeTakenToIdentifyEligibleFiles, + timeTakenForDelete = 0L, + vacuumStartTime = vacuumStartTime, + vacuumEndTime = System.currentTimeMillis, + numPartitionColumns = partitionColumns.size + ) + + recordDeltaEvent(deltaLog, "delta.gc.stats", data = stats) + logInfo(s"Found $numFiles files ($sizeOfDataToDelete bytes) and directories in " + + s"a total of $dirCounts directories that are safe to delete. Vacuum stats: $stats") + + return diffFiles.map(f => stringToPath(f).toString).toDF("path") + } + logVacuumStart( + spark, + deltaLog, + path, + diffFiles, + sizeOfDataToDelete, + retentionMillis, + snapshotTombstoneRetentionMillis) + + val deleteStartTime = System.currentTimeMillis() + val filesDeleted = try { + delete(diffFiles, spark, basePath, + hadoopConf, parallelDeleteEnabled, parallelDeletePartitions) + } catch { + case t: Throwable => + logVacuumEnd(deltaLog, spark, path) + throw t + } + val timeTakenForDelete = System.currentTimeMillis() - deleteStartTime + val stats = DeltaVacuumStats( + isDryRun = false, + specifiedRetentionMillis = retentionMillis, + defaultRetentionMillis = snapshotTombstoneRetentionMillis, + minRetainedTimestamp = deleteBeforeTimestamp, + dirsPresentBeforeDelete = dirCounts, + filesAndDirsPresentBeforeDelete = filesAndDirsPresentBeforeDelete, + objectsDeleted = filesDeleted, + sizeOfDataToDelete = sizeOfDataToDelete, + timeTakenToIdentifyEligibleFiles = timeTakenToIdentifyEligibleFiles, + timeTakenForDelete = timeTakenForDelete, + vacuumStartTime = vacuumStartTime, + vacuumEndTime = System.currentTimeMillis, + numPartitionColumns = partitionColumns.size) + recordDeltaEvent(deltaLog, "delta.gc.stats", data = stats) + logVacuumEnd(deltaLog, spark, path, Some(filesDeleted), Some(dirCounts)) + logInfo(s"Deleted $filesDeleted files ($sizeOfDataToDelete bytes) and directories in " + + s"a total of $dirCounts directories. Vacuum stats: $stats") + + + spark.createDataset(Seq(basePath)).toDF("path") + } finally { + allFilesAndDirs.unpersist() + + // --- modified start + if (originalEnabledGluten != null) { + spark.sparkContext.setLocalProperty( + QueryPlanSelector.GLUTEN_ENABLE_FOR_THREAD_KEY, originalEnabledGluten) + } else { + spark.sparkContext.setLocalProperty( + QueryPlanSelector.GLUTEN_ENABLE_FOR_THREAD_KEY, "true") + } + // --- modified end + } + } + } + } +} + +trait VacuumCommandImpl extends DeltaCommand { + + private val supportedFsForLogging = Seq( + "wasbs", "wasbss", "abfs", "abfss", "adl", "gs", "file", "hdfs" + ) + + /** + * Returns whether we should record vacuum metrics in the delta log. 
+ */ + private def shouldLogVacuum( + spark: SparkSession, + deltaLog: DeltaLog, + hadoopConf: Configuration, + path: Path): Boolean = { + val logVacuumConf = spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_LOGGING_ENABLED) + + if (logVacuumConf.nonEmpty) { + return logVacuumConf.get + } + + val logStore = deltaLog.store + + try { + val rawResolvedUri: URI = logStore.resolvePathOnPhysicalStorage(path, hadoopConf).toUri + val scheme = rawResolvedUri.getScheme + supportedFsForLogging.contains(scheme) + } catch { + case _: UnsupportedOperationException => + logWarning("Vacuum event logging" + + " not enabled on this file system because we cannot detect your cloud storage type.") + false + } + } + + /** + * Record Vacuum specific metrics in the commit log at the START of vacuum. + * + * @param spark - spark session + * @param deltaLog - DeltaLog of the table + * @param path - the (data) path to the root of the table + * @param diff - the list of paths (files, directories) that are safe to delete + * @param sizeOfDataToDelete - the amount of data (bytes) to be deleted + * @param specifiedRetentionMillis - the optional override retention period (millis) to keep + * logically removed files before deleting them + * @param defaultRetentionMillis - the default retention period (millis) + */ + protected def logVacuumStart( + spark: SparkSession, + deltaLog: DeltaLog, + path: Path, + diff: Dataset[String], + sizeOfDataToDelete: Long, + specifiedRetentionMillis: Option[Long], + defaultRetentionMillis: Long): Unit = { + logInfo(s"Deleting untracked files and empty directories in $path. The amount of data to be " + + s"deleted is $sizeOfDataToDelete (in bytes)") + + // We perform an empty commit in order to record information about the Vacuum + if (shouldLogVacuum(spark, deltaLog, deltaLog.newDeltaHadoopConf(), path)) { + val checkEnabled = + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_VACUUM_RETENTION_CHECK_ENABLED) + val txn = deltaLog.startTransaction() + val metrics = Map[String, SQLMetric]( + "numFilesToDelete" -> createMetric(spark.sparkContext, "number of files to deleted"), + "sizeOfDataToDelete" -> createMetric(spark.sparkContext, + "The total amount of data to be deleted in bytes") + ) + metrics("numFilesToDelete").set(diff.count()) + metrics("sizeOfDataToDelete").set(sizeOfDataToDelete) + txn.registerSQLMetrics(spark, metrics) + txn.commit(actions = Seq(), DeltaOperations.VacuumStart( + checkEnabled, + specifiedRetentionMillis, + defaultRetentionMillis + )) + } + } + + /** + * Record Vacuum specific metrics in the commit log at the END of vacuum. + * + * @param deltaLog - DeltaLog of the table + * @param spark - spark session + * @param path - the (data) path to the root of the table + * @param filesDeleted - if the vacuum completed this will contain the number of files deleted. + * if the vacuum failed, this will be None. + * @param dirCounts - if the vacuum completed this will contain the number of directories + * vacuumed. if the vacuum failed, this will be None. 
+ */ + protected def logVacuumEnd( + deltaLog: DeltaLog, + spark: SparkSession, + path: Path, + filesDeleted: Option[Long] = None, + dirCounts: Option[Long] = None): Unit = { + if (shouldLogVacuum(spark, deltaLog, deltaLog.newDeltaHadoopConf(), path)) { + val txn = deltaLog.startTransaction() + val status = if (filesDeleted.isEmpty && dirCounts.isEmpty) { "FAILED" } else { "COMPLETED" } + if (filesDeleted.nonEmpty && dirCounts.nonEmpty) { + val metrics = Map[String, SQLMetric]( + "numDeletedFiles" -> createMetric(spark.sparkContext, "number of files deleted."), + "numVacuumedDirectories" -> + createMetric(spark.sparkContext, "num of directories vacuumed."), + "status" -> createMetric(spark.sparkContext, "status of vacuum") + ) + metrics("numDeletedFiles").set(filesDeleted.get) + metrics("numVacuumedDirectories").set(dirCounts.get) + txn.registerSQLMetrics(spark, metrics) + } + txn.commit(actions = Seq(), DeltaOperations.VacuumEnd( + status + )) + } + + if (filesDeleted.nonEmpty) { + logConsole(s"Deleted ${filesDeleted.get} files and directories in a total " + + s"of ${dirCounts.get} directories.") + } + } + + /** + * Attempts to relativize the `path` with respect to the `reservoirBase` and converts the path to + * a string. + */ + protected def relativize( + path: Path, + fs: FileSystem, + reservoirBase: Path, + isDir: Boolean): String = { + pathToString(DeltaFileOperations.tryRelativizePath(fs, reservoirBase, path)) + } + + /** + * Wrapper function for DeltaFileOperations.getAllSubDirectories + * returns all subdirectories that `file` has with respect to `base`. + */ + protected def getAllSubdirs(base: String, file: String, fs: FileSystem): Iterator[String] = { + DeltaFileOperations.getAllSubDirectories(base, file)._1 + } + + /** + * Attempts to delete the list of candidate files. Returns the number of files deleted. + */ + protected def delete( + diff: Dataset[String], + spark: SparkSession, + basePath: String, + hadoopConf: Broadcast[SerializableConfiguration], + parallel: Boolean, + parallelPartitions: Int): Long = { + import org.apache.spark.sql.delta.implicits._ + + if (parallel) { + diff.repartition(parallelPartitions).mapPartitions { files => + val fs = new Path(basePath).getFileSystem(hadoopConf.value.value) + val filesDeletedPerPartition = + files.map(p => stringToPath(p)).count(f => tryDeleteNonRecursive(fs, f)) + Iterator(filesDeletedPerPartition) + }.collect().sum + } else { + val fs = new Path(basePath).getFileSystem(hadoopConf.value.value) + val fileResultSet = diff.toLocalIterator().asScala + fileResultSet.map(p => stringToPath(p)).count(f => tryDeleteNonRecursive(fs, f)) + } + } + + // scalastyle:off pathfromuri + protected def stringToPath(path: String): Path = new Path(new URI(path)) + // scalastyle:on pathfromuri + + protected def pathToString(path: Path): String = path.toUri.toString + + /** Returns the relative path of a file action or None if the file lives outside of the table. */ + protected def getActionRelativePath( + action: FileAction, + fs: FileSystem, + basePath: Path, + relativizeIgnoreError: Boolean): Option[String] = { + val filePath = stringToPath(action.path) + if (filePath.isAbsolute) { + val maybeRelative = + DeltaFileOperations.tryRelativizePath(fs, basePath, filePath, relativizeIgnoreError) + if (maybeRelative.isAbsolute) { + // This file lives outside the directory of the table. 
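// Editor's note: the sketch below is illustrative only and not part of this patch. It shows the
// parallel-delete pattern used by delete() above: candidate paths are repartitioned, each
// partition opens its own FileSystem handle, and per-partition deletion counts are summed on the
// driver. The demo path is hypothetical, and a default Configuration stands in for the broadcast
// SerializableConfiguration used by the real code.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.SparkSession

object ParallelDeleteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("parallel-delete").getOrCreate()
    import spark.implicits._

    val basePath = "/tmp/vacuum_demo"
    val candidates = spark.createDataset(Seq(s"$basePath/stale-file-1", s"$basePath/stale-file-2"))

    // Each partition deletes its share of the candidate files non-recursively and reports a count.
    val deleted = candidates.repartition(2).mapPartitions { paths =>
      val fs = new Path(basePath).getFileSystem(new Configuration())
      Iterator(paths.count(p => fs.delete(new Path(p), false)))
    }.collect().sum

    println(s"deleted $deleted files")
    spark.stop()
  }
}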
+ None + } else { + Some(pathToString(maybeRelative)) + } + } else { + Some(pathToString(filePath)) + } + } + + + /** + * Returns the relative paths of all files and subdirectories for this action that must be + * retained during GC. + */ + protected def getValidRelativePathsAndSubdirs( + action: FileAction, + fs: FileSystem, + basePath: Path, + relativizeIgnoreError: Boolean + ): Seq[String] = { + val paths = getActionRelativePath(action, fs, basePath, relativizeIgnoreError) + .map { + relativePath => + Seq(relativePath) ++ getAllSubdirs("/", relativePath, fs) + }.getOrElse(Seq.empty) + + val deletionVectorPath = + getDeletionVectorRelativePath(action).map(pathToString) + + paths ++ deletionVectorPath.toSeq + } + + /** + * Returns the path of the on-disk deletion vector if it is stored relative to the + * `basePath` otherwise `None`. + */ + protected def getDeletionVectorRelativePath(action: FileAction): Option[Path] = { + val dv = action match { + case a: AddFile if a.deletionVector != null => + Some(a.deletionVector) + case r: RemoveFile if r.deletionVector != null => + Some(r.deletionVector) + case _ => None + } + + dv match { + case Some(dv) if dv.isOnDisk => + if (dv.isRelative) { + // We actually want a relative path here. + Some(dv.absolutePath(new Path("."))) + } else { + assert(dv.isAbsolute) + // This is never going to be a path relative to `basePath` for DVs. + None + } + case _ => None + } + } +} + +case class DeltaVacuumStats( + isDryRun: Boolean, + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + specifiedRetentionMillis: Option[Long], + defaultRetentionMillis: Long, + minRetainedTimestamp: Long, + dirsPresentBeforeDelete: Long, + filesAndDirsPresentBeforeDelete: Long, + objectsDeleted: Long, + sizeOfDataToDelete: Long, + timeTakenToIdentifyEligibleFiles: Long, + timeTakenForDelete: Long, + vacuumStartTime: Long, + vacuumEndTime: Long, + numPartitionColumns: Long +) diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/merge/ClassicMergeExecutor.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/merge/ClassicMergeExecutor.scala new file mode 100644 index 0000000000000..42a89d4271972 --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/commands/merge/ClassicMergeExecutor.scala @@ -0,0 +1,571 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.spark.sql.delta.commands.merge
+
+import scala.collection.JavaConverters._
+
+import org.apache.spark.sql.delta._
+import org.apache.spark.sql.delta.actions.{AddCDCFile, AddFile, FileAction}
+import org.apache.spark.sql.delta.commands.{DeletionVectorBitmapGenerator, DMLWithDeletionVectorsHelper, MergeIntoCommandBase}
+import org.apache.spark.sql.delta.commands.cdc.CDCReader.{CDC_TYPE_COLUMN_NAME, CDC_TYPE_NOT_CDC}
+import org.apache.spark.sql.delta.commands.merge.MergeOutputGeneration.{SOURCE_ROW_INDEX_COL, TARGET_ROW_INDEX_COL}
+import org.apache.spark.sql.delta.files.TahoeBatchFileIndex
+import org.apache.spark.sql.delta.util.SetAccumulator
+
+import org.apache.spark.sql.{Column, Dataset, SparkSession}
+import org.apache.spark.sql.catalyst.expressions.{And, Expression, Literal, Or}
+import org.apache.spark.sql.catalyst.plans.logical.DeltaMergeIntoClause
+import org.apache.spark.sql.functions.{coalesce, col, count, input_file_name, lit, monotonically_increasing_id, sum}
+
+/**
+ * Gluten overwrite Delta:
+ *
+ * This file is copied from Delta 3.2.0. It is modified to overcome the following issues:
+ * 1. In the ClickHouse backend, input_file_name() cannot be implemented exactly; it can only be
+ *    implemented so that it returns a list of file names (concatenated by ',') in the
+ *    findTouchedFiles function.
+ */
+
+/**
+ * Trait with merge execution in two phases:
+ *
+ * Phase 1: Find the input files in target that are touched by the rows that satisfy
+ *    the condition and verify that no two source rows match with the same target row.
+ *    This is implemented as an inner-join using the given condition (see [[findTouchedFiles]]).
+ *    In the special case that there is no update clause we write all the non-matching
+ *    source data as new files and skip phase 2.
+ *    Issues an error message when the ON search_condition of the MERGE statement can match
+ *    a single row from the target table with multiple rows of the source table-reference.
+ *
+ * Phase 2: Read the touched files again and write new files with updated and/or inserted rows.
+ *    If there are updates, then use an outer join using the given condition to write the
+ *    updates and inserts (see [[writeAllChanges()]]). If there are no matches for updates,
+ *    only inserts, then write them directly (see [[writeInsertsOnlyWhenNoMatches()]]).
+ *
+ *    Note, when deletion vectors are enabled, phase 2 is split into two parts:
+ *    2.a. Read the touched files again and only write modified and new
+ *         rows (see [[writeAllChanges()]]).
+ *    2.b. Read the touched files and generate deletion vectors for the modified
+ *         rows (see [[writeDVs()]]).
+ *
+ * If there are no matches for updates, only inserts, then write them directly
+ * (see [[writeInsertsOnlyWhenNoMatches()]]). This remains the same when DVs are enabled since
+ * there are no modified rows. Furthermore, see [[InsertOnlyMergeExecutor]] for the optimized
+ * executor used in case there are only inserts.
+ */
+trait ClassicMergeExecutor extends MergeOutputGeneration {
+  self: MergeIntoCommandBase =>
+  import MergeIntoCommandBase._
+
+  /**
+   * Find the target table files that contain the rows that satisfy the merge condition. This is
+   * implemented as an inner-join between the source query/table and the target table using
+   * the merge condition.
+ */ + protected def findTouchedFiles( + spark: SparkSession, + deltaTxn: OptimisticTransaction + ): (Seq[AddFile], DeduplicateCDFDeletes) = recordMergeOperation( + extraOpType = "findTouchedFiles", + status = "MERGE operation - scanning files for matches", + sqlMetricName = "scanTimeMs") { + + val columnComparator = spark.sessionState.analyzer.resolver + + // Accumulator to collect all the distinct touched files + val touchedFilesAccum = new SetAccumulator[String]() + spark.sparkContext.register(touchedFilesAccum, TOUCHED_FILES_ACCUM_NAME) + + // Prune non-matching files if we don't need to collect them for NOT MATCHED BY SOURCE clauses. + val dataSkippedFiles = + if (notMatchedBySourceClauses.isEmpty) { + deltaTxn.filterFiles(getTargetOnlyPredicates(spark), keepNumRecords = true) + } else { + deltaTxn.filterFiles(filters = Seq(Literal.TrueLiteral), keepNumRecords = true) + } + + // Join the source and target table using the merge condition to find touched files. An inner + // join collects all candidate files for MATCHED clauses, a right outer join also includes + // candidates for NOT MATCHED BY SOURCE clauses. + // In addition, we attach two columns + // - a monotonically increasing row id for target rows to later identify whether the same + // target row is modified by multiple user or not + // - the target file name the row is from to later identify the files touched by matched rows + val joinType = if (notMatchedBySourceClauses.isEmpty) "inner" else "right_outer" + + // When they are only MATCHED clauses, after the join we prune files that have no rows that + // satisfy any of the clause conditions. + val matchedPredicate = + if (isMatchedOnly) { + matchedClauses + // An undefined condition (None) is implicitly true + .map(_.condition.getOrElse(Literal.TrueLiteral)) + .reduce((a, b) => Or(a, b)) + } else Literal.TrueLiteral + + // Compute the columns needed for the inner join. + val targetColsNeeded = { + condition.references.map(_.name) ++ deltaTxn.snapshot.metadata.partitionColumns ++ + matchedPredicate.references.map(_.name) + } + + val columnsToDrop = deltaTxn.snapshot.metadata.schema.map(_.name) + .filterNot { field => + targetColsNeeded.exists { name => columnComparator(name, field) } + } + val incrSourceRowCountExpr = incrementMetricAndReturnBool("numSourceRows", valueToReturn = true) + // We can't use filter() directly on the expression because that will prevent + // column pruning. We don't need the SOURCE_ROW_PRESENT_COL so we immediately drop it. 
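// Editor's note: the sketch below is illustrative only and not part of this patch. It restates
// how the matchedPredicate above folds the optional MATCHED-clause conditions into a single
// pruning predicate, treating an undefined condition as implicitly true. The helper name
// combineMatchedConditions is hypothetical; only the public Column API is used.
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.lit

object MatchedPredicateSketch {
  // OR together the clause conditions; a clause without a condition always matches.
  def combineMatchedConditions(conditions: Seq[Option[Column]]): Column =
    conditions
      .map(_.getOrElse(lit(true)))
      .reduceOption(_ || _)
      .getOrElse(lit(true)) // no MATCHED clauses: keep every joined row
}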
+ val sourceDF = getMergeSource.df + .withColumn(SOURCE_ROW_PRESENT_COL, Column(incrSourceRowCountExpr)) + .filter(SOURCE_ROW_PRESENT_COL) + .drop(SOURCE_ROW_PRESENT_COL) + val targetPlan = + buildTargetPlanWithFiles( + spark, + deltaTxn, + dataSkippedFiles, + columnsToDrop) + val targetDF = Dataset.ofRows(spark, targetPlan) + .withColumn(ROW_ID_COL, monotonically_increasing_id()) + .withColumn(FILE_NAME_COL, input_file_name()) + + val joinToFindTouchedFiles = + sourceDF.join(targetDF, Column(condition), joinType) + + // UDFs to records touched files names and add them to the accumulator + val recordTouchedFileName = + DeltaUDF.intFromStringBoolean { (fileName, shouldRecord) => + if (shouldRecord) { + // --- modified start + fileName.split(",").foreach(name => touchedFilesAccum.add(name)) + // --- modified end + } + 1 + }.asNondeterministic() + + // Process the matches from the inner join to record touched files and find multiple matches + val collectTouchedFiles = joinToFindTouchedFiles + .select(col(ROW_ID_COL), + recordTouchedFileName(col(FILE_NAME_COL), Column(matchedPredicate)).as("one")) + + // Calculate frequency of matches per source row + val matchedRowCounts = collectTouchedFiles.groupBy(ROW_ID_COL).agg(sum("one").as("count")) + + // Get multiple matches and simultaneously collect (using touchedFilesAccum) the file names + import org.apache.spark.sql.delta.implicits._ + val (multipleMatchCount, multipleMatchSum) = matchedRowCounts + .filter("count > 1") + .select(coalesce(count(Column("*")), lit(0)), coalesce(sum("count"), lit(0))) + .as[(Long, Long)] + .collect() + .head + + val hasMultipleMatches = multipleMatchCount > 0 + throwErrorOnMultipleMatches(hasMultipleMatches, spark) + if (hasMultipleMatches) { + // This is only allowed for delete-only queries. + // This query will count the duplicates for numTargetRowsDeleted in Job 2, + // because we count matches after the join and not just the target rows. + // We have to compensate for this by subtracting the duplicates later, + // so we need to record them here. + val duplicateCount = multipleMatchSum - multipleMatchCount + multipleMatchDeleteOnlyOvercount = Some(duplicateCount) + } + + // Get the AddFiles using the touched file names. + val touchedFileNames = touchedFilesAccum.value.iterator().asScala.toSeq + logTrace(s"findTouchedFiles: matched files:\n\t${touchedFileNames.mkString("\n\t")}") + + val nameToAddFileMap = generateCandidateFileMap(targetDeltaLog.dataPath, dataSkippedFiles) + val touchedAddFiles = touchedFileNames.map( + getTouchedFile(targetDeltaLog.dataPath, _, nameToAddFileMap)) + + if (metrics("numSourceRows").value == 0 && (dataSkippedFiles.isEmpty || + dataSkippedFiles.forall(_.numLogicalRecords.getOrElse(0) == 0))) { + // The target table is empty, and the optimizer optimized away the join entirely OR the + // source table is truly empty. In that case, scanning the source table once is the only + // way to get the correct metric. 
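// Editor's note: the sketch below is illustrative only and not part of this patch. It mirrors the
// "--- modified ---" block above: because the ClickHouse backend may return several file names
// concatenated by ',' from input_file_name(), each value is split before being added to the
// accumulator. The demo path and accumulator name are hypothetical.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{input_file_name, sum, udf}

object TouchedFilesSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("touched-files").getOrCreate()
    val touched = spark.sparkContext.collectionAccumulator[String]("touchedFiles")

    // Record every file name carried by the (possibly comma-concatenated) input column.
    val recordFile = udf((names: String) => {
      names.split(",").foreach(name => touched.add(name))
      1
    }).asNondeterministic()

    spark.range(100).write.mode("overwrite").parquet("/tmp/touched_files_demo")
    spark.read.parquet("/tmp/touched_files_demo")
      .select(recordFile(input_file_name()).as("one"))
      .agg(sum("one")) // aggregate over the UDF column so it cannot be pruned away
      .collect()

    // One entry per row here; the real code deduplicates with a SetAccumulator.
    println(touched.value)
    spark.stop()
  }
}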
+      val numSourceRows = sourceDF.count()
+      metrics("numSourceRows").set(numSourceRows)
+    }
+
+    metrics("numTargetFilesBeforeSkipping") += deltaTxn.snapshot.numOfFiles
+    metrics("numTargetBytesBeforeSkipping") += deltaTxn.snapshot.sizeInBytes
+    val (afterSkippingBytes, afterSkippingPartitions) =
+      totalBytesAndDistinctPartitionValues(dataSkippedFiles)
+    metrics("numTargetFilesAfterSkipping") += dataSkippedFiles.size
+    metrics("numTargetBytesAfterSkipping") += afterSkippingBytes
+    metrics("numTargetPartitionsAfterSkipping") += afterSkippingPartitions
+    val (removedBytes, removedPartitions) = totalBytesAndDistinctPartitionValues(touchedAddFiles)
+    metrics("numTargetFilesRemoved") += touchedAddFiles.size
+    metrics("numTargetBytesRemoved") += removedBytes
+    metrics("numTargetPartitionsRemovedFrom") += removedPartitions
+    val dedupe = DeduplicateCDFDeletes(
+      hasMultipleMatches && isCdcEnabled(deltaTxn),
+      includesInserts)
+    (touchedAddFiles, dedupe)
+  }
+
+  /**
+   * Helper function that produces an expression by combining a sequence of clauses with OR.
+   * Requires the sequence to be non-empty.
+   */
+  protected def clauseDisjunction(clauses: Seq[DeltaMergeIntoClause]): Expression = {
+    require(clauses.nonEmpty)
+    clauses
+      .map(_.condition.getOrElse(Literal.TrueLiteral))
+      .reduceLeft(Or)
+  }
+
+  /**
+   * Returns the expression that can be used for selecting the modified rows generated
+   * by the merge operation. The expression is designed to work irrespective
+   * of the join type used between the source and target tables.
+   *
+   * The expression consists of two parts, one for each of the action clause types that produce
+   * row modifications: MATCHED, NOT MATCHED BY SOURCE. All actions of the same clause type form
+   * a disjunctive clause. The result is then conjoined with an expression that filters the rows
+   * of the particular action clause type. For example:
+   *
+   * MERGE INTO t
+   * USING s
+   * ON s.id = t.id
+   * WHEN MATCHED AND id < 5 THEN ...
+   * WHEN MATCHED AND id > 10 THEN ...
+   * WHEN NOT MATCHED BY SOURCE AND id > 20 THEN ...
+   *
+   * Produces the following expression:
+   *
+   * ((s.id = t.id) AND (id < 5 OR id > 10))
+   * OR
+   * ((SOURCE TABLE IS NULL) AND (id > 20))
+   */
+  protected def generateFilterForModifiedRows(): Expression = {
+    val matchedExpression = if (matchedClauses.nonEmpty) {
+      And(Column(condition).expr, clauseDisjunction(matchedClauses))
+    } else {
+      Literal.FalseLiteral
+    }
+
+    val notMatchedBySourceExpression = if (notMatchedBySourceClauses.nonEmpty) {
+      val combinedClauses = clauseDisjunction(notMatchedBySourceClauses)
+      And(col(SOURCE_ROW_PRESENT_COL).isNull.expr, combinedClauses)
+    } else {
+      Literal.FalseLiteral
+    }
+
+    Or(matchedExpression, notMatchedBySourceExpression)
+  }
+
+  /**
+   * Returns the expression that can be used for selecting the new rows generated
+   * by the merge operation.
+   */
+  protected def generateFilterForNewRows(): Expression = {
+    if (notMatchedClauses.nonEmpty) {
+      val combinedClauses = clauseDisjunction(notMatchedClauses)
+      And(col(TARGET_ROW_PRESENT_COL).isNull.expr, combinedClauses)
+    } else {
+      Literal.FalseLiteral
+    }
+  }
+
+  /**
+   * Write new files by reading the touched files and updating/inserting data using the source
+   * query/table. This is implemented as a full-outer-join on the merge condition.
+   *
+   * Note that unlike the insert-only code paths with just one control column ROW_DROPPED_COL, this
+   * method has a second control column CDC_TYPE_COL_NAME used for handling CDC when enabled.
+ */ + protected def writeAllChanges( + spark: SparkSession, + deltaTxn: OptimisticTransaction, + filesToRewrite: Seq[AddFile], + deduplicateCDFDeletes: DeduplicateCDFDeletes, + writeUnmodifiedRows: Boolean): Seq[FileAction] = recordMergeOperation( + extraOpType = if (!writeUnmodifiedRows) { + "writeModifiedRowsOnly" + } else if (shouldOptimizeMatchedOnlyMerge(spark)) { + "writeAllUpdatesAndDeletes" + } else { + "writeAllChanges" + }, + status = s"MERGE operation - Rewriting ${filesToRewrite.size} files", + sqlMetricName = "rewriteTimeMs") { + + val cdcEnabled = isCdcEnabled(deltaTxn) + + require( + !deduplicateCDFDeletes.enabled || cdcEnabled, + "CDF delete duplication is enabled but overall the CDF generation is disabled") + + // Generate a new target dataframe that has same output attributes exprIds as the target plan. + // This allows us to apply the existing resolved update/insert expressions. + val targetPlan = buildTargetPlanWithFiles( + spark, + deltaTxn, + filesToRewrite, + columnsToDrop = Nil) + val baseTargetDF = RowTracking.preserveRowTrackingColumns( + dfWithoutRowTrackingColumns = Dataset.ofRows(spark, targetPlan), + snapshot = deltaTxn.snapshot) + + val joinType = if (writeUnmodifiedRows) { + if (shouldOptimizeMatchedOnlyMerge(spark)) { + "rightOuter" + } else { + "fullOuter" + } + } else { + // Since we do not need to write unmodified rows, we can perform stricter joins. + if (isMatchedOnly) { + "inner" + } else if (notMatchedBySourceClauses.isEmpty) { + "leftOuter" + } else if (notMatchedClauses.isEmpty) { + "rightOuter" + } else { + "fullOuter" + } + } + + logDebug(s"""writeAllChanges using $joinType join: + | source.output: ${source.outputSet} + | target.output: ${target.outputSet} + | condition: $condition + | newTarget.output: ${baseTargetDF.queryExecution.logical.outputSet} + """.stripMargin) + + // Expressions to update metrics + val incrSourceRowCountExpr = incrementMetricAndReturnBool( + "numSourceRowsInSecondScan", valueToReturn = true) + val incrNoopCountExpr = incrementMetricAndReturnBool( + "numTargetRowsCopied", valueToReturn = false) + + // Apply an outer join to find both, matches and non-matches. We are adding two boolean fields + // with value `true`, one to each side of the join. Whether this field is null or not after + // the outer join, will allow us to identify whether the joined row was a + // matched inner result or an unmatched result with null on one side. + val joinedBaseDF = { + var sourceDF = getMergeSource.df + if (deduplicateCDFDeletes.enabled && deduplicateCDFDeletes.includesInserts) { + // Add row index for the source rows to identify inserted rows during the cdf deleted rows + // deduplication. See [[deduplicateCDFDeletes()]] + sourceDF = sourceDF.withColumn(SOURCE_ROW_INDEX_COL, monotonically_increasing_id()) + } + val left = sourceDF + .withColumn(SOURCE_ROW_PRESENT_COL, Column(incrSourceRowCountExpr)) + // In some cases, the optimizer (incorrectly) decides to omit the metrics column. + // This causes issues in the source determinism validation. We work around the issue by + // adding a redundant dummy filter to make sure the column is not pruned. 
+ .filter(SOURCE_ROW_PRESENT_COL) + + val targetDF = baseTargetDF + .withColumn(TARGET_ROW_PRESENT_COL, lit(true)) + val right = if (deduplicateCDFDeletes.enabled) { + targetDF.withColumn(TARGET_ROW_INDEX_COL, monotonically_increasing_id()) + } else { + targetDF + } + left.join(right, Column(condition), joinType) + } + + val joinedDF = + if (writeUnmodifiedRows) { + joinedBaseDF + } else { + val filter = Or(generateFilterForModifiedRows(), generateFilterForNewRows()) + joinedBaseDF.filter(Column(filter)) + } + + // Precompute conditions in matched and not matched clauses and generate + // the joinedDF with precomputed columns and clauses with rewritten conditions. + val (joinedAndPrecomputedConditionsDF, clausesWithPrecompConditions) = + generatePrecomputedConditionsAndDF( + joinedDF, + clauses = matchedClauses ++ notMatchedClauses ++ notMatchedBySourceClauses) + + // In case Row IDs are preserved, get the attribute expression of the Row ID column. + val rowIdColumnExpressionOpt = + MaterializedRowId.getAttribute(deltaTxn.snapshot, joinedAndPrecomputedConditionsDF) + + val rowCommitVersionColumnExpressionOpt = + MaterializedRowCommitVersion.getAttribute(deltaTxn.snapshot, joinedAndPrecomputedConditionsDF) + + // The target output columns need to be marked as nullable here, as they are going to be used + // to reference the output of an outer join. + val targetWriteCols = postEvolutionTargetExpressions(makeNullable = true) + + // If there are N columns in the target table, the full outer join output will have: + // - N columns for target table + // - Two optional Row ID / Row commit version preservation columns with their physical name. + // - ROW_DROPPED_COL to define whether the generated row should be dropped or written + // - if CDC is enabled, also CDC_TYPE_COLUMN_NAME with the type of change being performed + // in a particular row + // (N+1 or N+2 columns depending on CDC disabled / enabled) + val outputColNames = + targetWriteCols.map(_.name) ++ + rowIdColumnExpressionOpt.map(_.name) ++ + rowCommitVersionColumnExpressionOpt.map(_.name) ++ + Seq(ROW_DROPPED_COL) ++ + (if (cdcEnabled) Some(CDC_TYPE_COLUMN_NAME) else None) + + // Copy expressions to copy the existing target row and not drop it (ROW_DROPPED_COL=false), + // and in case CDC is enabled, set it to CDC_TYPE_NOT_CDC. + // (N+1 or N+2 or N+3 columns depending on CDC disabled / enabled and if Row IDs are preserved) + val noopCopyExprs = + targetWriteCols ++ + rowIdColumnExpressionOpt ++ + rowCommitVersionColumnExpressionOpt ++ + Seq(incrNoopCountExpr) ++ + (if (cdcEnabled) Seq(CDC_TYPE_NOT_CDC) else Seq()) + + // Generate output columns. + val outputCols = generateWriteAllChangesOutputCols( + targetWriteCols, + rowIdColumnExpressionOpt, + rowCommitVersionColumnExpressionOpt, + outputColNames, + noopCopyExprs, + clausesWithPrecompConditions, + cdcEnabled + ) + + val preOutputDF = if (cdcEnabled) { + generateCdcAndOutputRows( + joinedAndPrecomputedConditionsDF, + outputCols, + outputColNames, + noopCopyExprs, + rowIdColumnExpressionOpt.map(_.name), + rowCommitVersionColumnExpressionOpt.map(_.name), + deduplicateCDFDeletes) + } else { + // change data capture is off, just output the normal data + joinedAndPrecomputedConditionsDF + .select(outputCols: _*) + } + // The filter ensures we only consider rows that are not dropped. + // The drop ensures that the dropped flag does not leak out to the output. 
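// Editor's note: the sketch below is illustrative only and not part of this patch. It
// demonstrates the presence-marker technique used above: each side of the outer join carries a
// literal true column, and the nullness of that column after the join distinguishes matched rows
// from source-only and target-only rows. Table contents and column names are made up.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, when}

object PresenceMarkerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("presence-marker").getOrCreate()
    import spark.implicits._

    val source = Seq((1, "a"), (2, "b")).toDF("id", "src_val")
      .withColumn("_source_present", lit(true))
    val target = Seq((2, "x"), (3, "y")).toDF("id", "tgt_val")
      .withColumn("_target_present", lit(true))

    val classified = source.join(target, Seq("id"), "full_outer")
      .withColumn("row_kind",
        when(col("_source_present").isNull, "not_matched_by_source")
          .when(col("_target_present").isNull, "not_matched")
          .otherwise("matched"))

    classified.show() // id=1 -> not_matched, id=2 -> matched, id=3 -> not_matched_by_source
    spark.stop()
  }
}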
+ val outputDF = preOutputDF + .filter(s"$ROW_DROPPED_COL = false") + .drop(ROW_DROPPED_COL) + + logDebug("writeAllChanges: join output plan:\n" + outputDF.queryExecution) + + // Write to Delta + val newFiles = writeFiles(spark, deltaTxn, outputDF) + + // Update metrics + val (addedBytes, addedPartitions) = totalBytesAndDistinctPartitionValues(newFiles) + metrics("numTargetFilesAdded") += newFiles.count(_.isInstanceOf[AddFile]) + metrics("numTargetChangeFilesAdded") += newFiles.count(_.isInstanceOf[AddCDCFile]) + metrics("numTargetChangeFileBytes") += newFiles.collect{ case f: AddCDCFile => f.size }.sum + metrics("numTargetBytesAdded") += addedBytes + metrics("numTargetPartitionsAddedTo") += addedPartitions + if (multipleMatchDeleteOnlyOvercount.isDefined) { + // Compensate for counting duplicates during the query. + val actualRowsDeleted = + metrics("numTargetRowsDeleted").value - multipleMatchDeleteOnlyOvercount.get + assert(actualRowsDeleted >= 0) + metrics("numTargetRowsDeleted").set(actualRowsDeleted) + val actualRowsMatchedDeleted = + metrics("numTargetRowsMatchedDeleted").value - multipleMatchDeleteOnlyOvercount.get + assert(actualRowsMatchedDeleted >= 0) + metrics("numTargetRowsMatchedDeleted").set(actualRowsMatchedDeleted) + } + + newFiles + } + + /** + * Writes Deletion Vectors for rows modified by the merge operation. + */ + protected def writeDVs( + spark: SparkSession, + deltaTxn: OptimisticTransaction, + filesToRewrite: Seq[AddFile]): Seq[FileAction] = recordMergeOperation( + extraOpType = "writeDeletionVectors", + status = s"MERGE operation - Rewriting Deletion Vectors to ${filesToRewrite.size} files", + sqlMetricName = "rewriteTimeMs") { + + val fileIndex = new TahoeBatchFileIndex( + spark, + actionType = "merge", + addFiles = filesToRewrite, + deltaLog = deltaTxn.deltaLog, + path = deltaTxn.deltaLog.dataPath, + snapshot = deltaTxn.snapshot) + + val targetDF = DMLWithDeletionVectorsHelper.createTargetDfForScanningForMatches( + spark, + target, + fileIndex) + + // For writing DVs we are only interested in the target table. When there are no + // notMatchedBySource clauses an inner join is sufficient. Otherwise, we need an rightOuter + // join to include target rows that are not matched. + val joinType = if (notMatchedBySourceClauses.isEmpty) { + "inner" + } else { + "rightOuter" + } + + val joinedDF = getMergeSource.df + .withColumn(SOURCE_ROW_PRESENT_COL, lit(true)) + .join(targetDF, Column(condition), joinType) + + val modifiedRowsFilter = generateFilterForModifiedRows() + val matchedDVResult = + DeletionVectorBitmapGenerator.buildRowIndexSetsForFilesMatchingCondition( + spark, + deltaTxn, + tableHasDVs = true, + targetDf = joinedDF, + candidateFiles = filesToRewrite, + condition = modifiedRowsFilter + ) + + val nameToAddFileMap = generateCandidateFileMap(targetDeltaLog.dataPath, filesToRewrite) + + val touchedFilesWithDVs = DMLWithDeletionVectorsHelper + .findFilesWithMatchingRows(deltaTxn, nameToAddFileMap, matchedDVResult) + + val (dvActions, metricsMap) = DMLWithDeletionVectorsHelper.processUnmodifiedData( + spark, + touchedFilesWithDVs, + deltaTxn.snapshot) + + metrics("numTargetDeletionVectorsAdded") + .set(metricsMap.getOrElse("numDeletionVectorsAdded", 0L)) + metrics("numTargetDeletionVectorsRemoved") + .set(metricsMap.getOrElse("numDeletionVectorsRemoved", 0L)) + metrics("numTargetDeletionVectorsUpdated") + .set(metricsMap.getOrElse("numDeletionVectorsUpdated", 0L)) + + // When DVs are enabled we override metrics related to removed files. 
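// Editor's note: the sketch below is illustrative only and not part of this patch. It restates
// the join-type decisions made in writeAllChanges and writeDVs above as a pure function, so the
// decision table is easier to scan; the boolean flags are stand-ins for the trait members used
// by the real code.
object MergeJoinTypeSketch {
  def joinType(
      writeUnmodifiedRows: Boolean,
      matchedOnlyOptimization: Boolean,
      isMatchedOnly: Boolean,
      hasNotMatchedBySourceClauses: Boolean,
      hasNotMatchedClauses: Boolean): String =
    if (writeUnmodifiedRows) {
      // Unmodified target rows must survive the join, so the target side stays outer.
      if (matchedOnlyOptimization) "rightOuter" else "fullOuter"
    } else {
      // Only modified or new rows are written (the deletion-vector path), so stricter joins work.
      if (isMatchedOnly) "inner"
      else if (!hasNotMatchedBySourceClauses) "leftOuter"
      else if (!hasNotMatchedClauses) "rightOuter"
      else "fullOuter"
    }

  def main(args: Array[String]): Unit = {
    // A matched-only merge that skips unmodified rows can use a plain inner join.
    println(joinType(writeUnmodifiedRows = false, matchedOnlyOptimization = true,
      isMatchedOnly = true, hasNotMatchedBySourceClauses = false, hasNotMatchedClauses = false))
  }
}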
+    metrics("numTargetFilesRemoved").set(metricsMap.getOrElse("numRemovedFiles", 0L))
+
+    val fullyRemovedFiles = touchedFilesWithDVs.filter(_.isFullyReplaced()).map(_.fileLogEntry)
+    val (removedBytes, removedPartitions) = totalBytesAndDistinctPartitionValues(fullyRemovedFiles)
+    metrics("numTargetBytesRemoved").set(removedBytes)
+    metrics("numTargetPartitionsRemovedFrom").set(removedPartitions)
+
+    dvActions
+  }
+}
diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/files/MergeTreeCommitProtocol.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/files/MergeTreeCommitProtocol.scala
new file mode 100644
index 0000000000000..33dbce138a426
--- /dev/null
+++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/files/MergeTreeCommitProtocol.scala
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.delta.files
+
+// scalastyle:off import.ordering.noEmptyLine
+import java.util.UUID
+
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.spark.sql.delta.DeltaErrors
+import org.apache.spark.sql.delta.actions.{AddCDCFile, AddFile, FileAction}
+import org.apache.spark.sql.delta.commands.cdc.CDCReader.{CDC_LOCATION, CDC_PARTITION_COL}
+import org.apache.spark.sql.delta.util.{DateFormatter, PartitionUtils, TimestampFormatter, Utils => DeltaUtils}
+import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.io.FileCommitProtocol
+import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage
+import org.apache.spark.sql.catalyst.expressions.Cast
+import org.apache.spark.sql.types.StringType
+
+/**
+ * This file is copied from DelayedCommitProtocol in Delta 3.2.0
+ * and renamed to MergeTreeCommitProtocol.
+ * It is modified to overcome the following issues:
+ *   1. the function commitTask returns TaskCommitMessage(Nil);
+ *      the FileStatus list is obtained from the CH backend instead.
+ */
+
+/**
+ * Writes out the files to `path` and returns a list of them in `addedStatuses`. Includes
+ * special handling for partitioning on [[CDC_PARTITION_COL]] for
+ * compatibility between enabled and disabled CDC; partitions with a value of false in this
+ * column produce no corresponding partitioning directory.
+ * @param path The base path files will be written + * @param randomPrefixLength The length of random subdir name under 'path' that files been written + * @param subdir The immediate subdir under path; If randomPrefixLength and subdir both exist, file + * path will be path/subdir/[rand str of randomPrefixLength]/file + */ +class MergeTreeCommitProtocol( + jobId: String, + path: String, + randomPrefixLength: Option[Int], + subdir: Option[String]) + extends FileCommitProtocol with Serializable with Logging { + // Track the list of files added by a task, only used on the executors. + @transient protected var addedFiles: ArrayBuffer[(Map[String, String], String)] = _ + + // Track the change files added, only used on the driver. Files are sorted between this buffer + // and addedStatuses based on the value of the [[CDC_TYPE_COLUMN_NAME]] partition column - a + // file goes to addedStatuses if the value is CDC_TYPE_NOT_CDC and changeFiles otherwise. + @transient val changeFiles = new ArrayBuffer[AddCDCFile] + + // Track the overall files added, only used on the driver. + // + // In rare cases, some of these AddFiles can be empty (i.e. contain no logical records). + // If the caller wishes to have only non-empty AddFiles, they must collect stats and perform + // the filter themselves. See TransactionalWrite::writeFiles. This filter will be best-effort, + // since there's no guarantee the stats will exist. + @transient val addedStatuses = new ArrayBuffer[AddFile] + + val timestampPartitionPattern = "yyyy-MM-dd HH:mm:ss[.S]" + + // Constants for CDC partition manipulation. Used only in newTaskTempFile(), but we define them + // here to avoid building a new redundant regex for every file. + protected val cdcPartitionFalse = s"${CDC_PARTITION_COL}=false" + protected val cdcPartitionTrue = s"${CDC_PARTITION_COL}=true" + protected val cdcPartitionTrueRegex = cdcPartitionTrue.r + + override def setupJob(jobContext: JobContext): Unit = { + + } + + /** + * Commits a job after the writes succeed. Must be called on the driver. Partitions the written + * files into [[AddFile]]s and [[AddCDCFile]]s as these metadata actions are treated differently + * by [[TransactionalWrite]] (i.e. AddFile's may have additional statistics injected) + */ + override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { + val (addFiles, changeFiles) = taskCommits.flatMap(_.obj.asInstanceOf[Seq[_]]) + .partition { + case _: AddFile => true + case _: AddCDCFile => false + case other => + throw DeltaErrors.unrecognizedFileAction(s"$other", s"${other.getClass}") + } + + // we cannot add type information above because of type erasure + addedStatuses ++= addFiles.map(_.asInstanceOf[AddFile]) + this.changeFiles ++= changeFiles.map(_.asInstanceOf[AddCDCFile]).toArray[AddCDCFile] + } + + override def abortJob(jobContext: JobContext): Unit = { + // TODO: Best effort cleanup + } + + override def setupTask(taskContext: TaskAttemptContext): Unit = { + addedFiles = new ArrayBuffer[(Map[String, String], String)] + } + + protected def getFileName( + taskContext: TaskAttemptContext, + ext: String, + partitionValues: Map[String, String]): String = { + // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet + // Note that %05d does not truncate the split number, so if we have more than 100000 tasks, + // the file name is fine and won't overflow. 
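// Editor's note: the sketch below is illustrative only and not part of this patch. commitJob
// above splits the collected file actions into data files and CDC files with partition(); this
// standalone version shows the same pattern on a simplified action hierarchy (the case classes
// are stand-ins, not the Delta actions).
object CommitSplitSketch {
  sealed trait Action
  final case class DataFile(path: String) extends Action
  final case class CdcFile(path: String) extends Action

  def split(actions: Seq[Action]): (Seq[DataFile], Seq[CdcFile]) = {
    val (data, cdc) = actions.partition {
      case _: DataFile => true
      case _: CdcFile => false
    }
    // partition() erases the element types, so cast each side back, as the original does.
    (data.map(_.asInstanceOf[DataFile]), cdc.map(_.asInstanceOf[CdcFile]))
  }

  def main(args: Array[String]): Unit = {
    val (data, cdc) = split(Seq(DataFile("part-00000"), CdcFile("cdc-00000"), DataFile("part-00001")))
    println(s"data=${data.size}, cdc=${cdc.size}") // data=2, cdc=1
  }
}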
+ val split = taskContext.getTaskAttemptID.getTaskID.getId + val uuid = UUID.randomUUID.toString + // CDC files (CDC_PARTITION_COL = true) are named with "cdc-..." instead of "part-...". + if (partitionValues.get(CDC_PARTITION_COL).contains("true")) { + f"cdc-$split%05d-$uuid$ext" + } else { + f"part-$split%05d-$uuid$ext" + } + } + + protected def parsePartitions(dir: String): Map[String, String] = { + // TODO: timezones? + // TODO: enable validatePartitionColumns? + val dateFormatter = DateFormatter() + val timestampFormatter = + TimestampFormatter(timestampPartitionPattern, java.util.TimeZone.getDefault) + val parsedPartition = + PartitionUtils + .parsePartition( + new Path(dir), + typeInference = false, + Set.empty, + Map.empty, + validatePartitionColumns = false, + java.util.TimeZone.getDefault, + dateFormatter, + timestampFormatter) + ._1 + .get + parsedPartition + .columnNames + .zip( + parsedPartition + .literals + .map(l => Cast(l, StringType).eval()) + .map(Option(_).map(_.toString).orNull)) + .toMap + } + + /** + * Notifies the commit protocol to add a new file, and gets back the full path that should be + * used. + * + * Includes special logic for CDC files and paths. Specifically, if the directory `dir` contains + * the CDC partition `__is_cdc=true` then + * - the file name begins with `cdc-` instead of `part-` + * - the directory has the `__is_cdc=true` partition removed and is placed in the `_changed_data` + * folder + */ + override def newTaskTempFile( + taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { + val partitionValues = dir.map(parsePartitions).getOrElse(Map.empty[String, String]) + val filename = getFileName(taskContext, ext, partitionValues) + val relativePath = randomPrefixLength.map { prefixLength => + DeltaUtils.getRandomPrefix(prefixLength) // Generate a random prefix as a first choice + }.orElse { + dir // or else write into the partition directory if it is partitioned + }.map { subDir => + // Do some surgery on the paths we write out to eliminate the CDC_PARTITION_COL. Non-CDC + // data is written to the base location, while CDC data is written to a special folder + // _change_data. + // The code here gets a bit complicated to accommodate two corner cases: an empty subdir + // can't be passed to new Path() at all, and a single-level subdir won't have a trailing + // slash. + if (subDir == cdcPartitionFalse) { + new Path(filename) + } else if (subDir.startsWith(cdcPartitionTrue)) { + val cleanedSubDir = cdcPartitionTrueRegex.replaceFirstIn(subDir, CDC_LOCATION) + new Path(cleanedSubDir, filename) + } else if (subDir.startsWith(cdcPartitionFalse)) { + // We need to remove the trailing slash in addition to the directory - otherwise + // it'll be interpreted as an absolute path and fail. 
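// Editor's note: the sketch below is illustrative only and not part of this patch. It mirrors
// what getFileName and newTaskTempFile do for CDC rows: the file gets a "cdc-" instead of a
// "part-" prefix, the synthetic __is_cdc=false partition directory is stripped, and
// __is_cdc=true is rewritten to the _change_data folder. The helper names are hypothetical.
import java.util.UUID

object CdcNamingSketch {
  private val CdcCol = "__is_cdc"
  private val CdcFalse = s"$CdcCol=false"
  private val CdcTrue = s"$CdcCol=true"
  private val CdcLocation = "_change_data"

  def fileName(split: Int, ext: String, partitionValues: Map[String, String]): String = {
    val prefix = if (partitionValues.get(CdcCol).contains("true")) "cdc" else "part"
    f"$prefix-$split%05d-${UUID.randomUUID}$ext"
  }

  def rewriteCdcDir(subDir: String): Option[String] =
    if (subDir == CdcFalse) None // write straight into the base location
    else if (subDir.startsWith(CdcTrue)) Some(subDir.replaceFirst(CdcTrue, CdcLocation))
    else if (subDir.startsWith(CdcFalse)) Some(subDir.stripPrefix(CdcFalse + "/"))
    else Some(subDir) // ordinary partition directory, left untouched

  def main(args: Array[String]): Unit = {
    println(fileName(3, ".parquet", Map(CdcCol -> "true"))) // cdc-00003-<uuid>.parquet
    println(rewriteCdcDir("__is_cdc=true/date=2024"))       // Some(_change_data/date=2024)
    println(rewriteCdcDir("__is_cdc=false/date=2024"))      // Some(date=2024)
  }
}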
+ val cleanedSubDir = subDir.stripPrefix(cdcPartitionFalse + "/") + new Path(cleanedSubDir, filename) + } else { + new Path(subDir, filename) + } + }.getOrElse(new Path(filename)) // or directly write out to the output path + + val relativePathWithSubdir = subdir.map(new Path(_, relativePath)).getOrElse(relativePath) + addedFiles.append((partitionValues, relativePathWithSubdir.toUri.toString)) + new Path(path, relativePathWithSubdir).toString + } + + override def newTaskTempFileAbsPath( + taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = { + throw DeltaErrors.unsupportedAbsPathAddFile(s"$this") + } + + protected def buildActionFromAddedFile( + f: (Map[String, String], String), + stat: FileStatus, + taskContext: TaskAttemptContext): FileAction = { + // The partitioning in the Delta log action will be read back as part of the data, so our + // virtual CDC_PARTITION_COL needs to be stripped out. + val partitioning = f._1.filter { case (k, v) => k != CDC_PARTITION_COL } + f._1.get(CDC_PARTITION_COL) match { + case Some("true") => + val partitioning = f._1.filter { case (k, v) => k != CDC_PARTITION_COL } + AddCDCFile(f._2, partitioning, stat.getLen) + case _ => + val addFile = AddFile(f._2, partitioning, stat.getLen, stat.getModificationTime, true) + addFile + } + } + + override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { + // --- modified start + /* if (addedFiles.nonEmpty) { + val fs = new Path(path, addedFiles.head._2).getFileSystem(taskContext.getConfiguration) + val statuses: Seq[FileAction] = addedFiles.map { f => + // scalastyle:off pathfromuri + val filePath = new Path(path, new Path(new URI(f._2))) + // scalastyle:on pathfromuri + val stat = fs.getFileStatus(filePath) + + buildActionFromAddedFile(f, stat, taskContext) + }.toSeq + + new TaskCommitMessage(statuses) + } else { + new TaskCommitMessage(Nil) + } */ + // --- modified end + new TaskCommitMessage(Nil) + } + + override def abortTask(taskContext: TaskAttemptContext): Unit = { + // TODO: we can also try delete the addedFiles as a best-effort cleanup. + } +} + diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseDataSource.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseDataSource.scala new file mode 100644 index 0000000000000..fcf1cee666719 --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseDataSource.scala @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.execution.datasources.v2.clickhouse + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.connector.catalog.Table +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.delta._ +import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 +import org.apache.spark.sql.delta.commands.WriteIntoDelta +import org.apache.spark.sql.delta.commands.cdc.CDCReader +import org.apache.spark.sql.delta.sources.{DeltaDataSource, DeltaSourceUtils, DeltaSQLConf} +import org.apache.spark.sql.sources.BaseRelation +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +import org.apache.hadoop.fs.Path + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +/** A DataSource V1 for integrating Delta into Spark SQL batch and Streaming APIs. */ +class ClickHouseDataSource extends DeltaDataSource { + + override def shortName(): String = { + ClickHouseConfig.NAME + } + + override def getTable( + schema: StructType, + partitioning: Array[Transform], + properties: java.util.Map[String, String]): Table = { + val options = new CaseInsensitiveStringMap(properties) + val path = options.get("path") + if (path == null) throw DeltaErrors.pathNotSpecifiedException + new ClickHouseTableV2( + SparkSession.active, + new Path(path), + options = properties.asScala.toMap, + clickhouseExtensionOptions = ClickHouseConfig + .createMergeTreeConfigurations( + ClickHouseConfig + .getMergeTreeConfigurations(properties) + .asJava) + ) + } + + override def createRelation( + sqlContext: SQLContext, + mode: SaveMode, + parameters: Map[String, String], + data: DataFrame): BaseRelation = { + val path = parameters.getOrElse("path", throw DeltaErrors.pathNotSpecifiedException) + val partitionColumns = parameters + .get(DeltaSourceUtils.PARTITIONING_COLUMNS_KEY) + .map(DeltaDataSource.decodePartitioningColumns) + .getOrElse(Nil) + + val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, new Path(path), parameters) + // need to use the latest snapshot + val configs = if (deltaLog.update().version < 0) { + // when creating table, save the clickhouse config to the delta metadata + val clickHouseTableV2 = ClickHouseTableV2.getTable(deltaLog) + clickHouseTableV2.properties().asScala.toMap ++ DeltaConfigs + .validateConfigurations(parameters.filterKeys(_.startsWith("delta.")).toMap) + } else { + DeltaConfigs.validateConfigurations(parameters.filterKeys(_.startsWith("delta.")).toMap) + } + WriteIntoDelta( + deltaLog = deltaLog, + mode = mode, + new DeltaOptions(parameters, sqlContext.sparkSession.sessionState.conf), + partitionColumns = partitionColumns, + configuration = configs, + data = data + ).run(sqlContext.sparkSession) + + deltaLog.createRelation() + } + + override def createRelation( + sqlContext: SQLContext, + parameters: Map[String, String]): BaseRelation = { + recordFrameProfile("Delta", "DeltaDataSource.createRelation") { + val maybePath = parameters.getOrElse("path", throw DeltaErrors.pathNotSpecifiedException) + + // Log any invalid options that are being passed in + DeltaOptions.verifyOptions(CaseInsensitiveMap(parameters)) + + val timeTravelByParams = DeltaDataSource.getTimeTravelVersion(parameters) + var cdcOptions: mutable.Map[String, String] = mutable.Map.empty + val caseInsensitiveParams = new CaseInsensitiveStringMap(parameters.asJava) + if (CDCReader.isCDCRead(caseInsensitiveParams)) { + cdcOptions = 
mutable.Map[String, String](DeltaDataSource.CDC_ENABLED_KEY -> "true") + if (caseInsensitiveParams.containsKey(DeltaDataSource.CDC_START_VERSION_KEY)) { + cdcOptions(DeltaDataSource.CDC_START_VERSION_KEY) = + caseInsensitiveParams.get(DeltaDataSource.CDC_START_VERSION_KEY) + } + if (caseInsensitiveParams.containsKey(DeltaDataSource.CDC_START_TIMESTAMP_KEY)) { + cdcOptions(DeltaDataSource.CDC_START_TIMESTAMP_KEY) = + caseInsensitiveParams.get(DeltaDataSource.CDC_START_TIMESTAMP_KEY) + } + if (caseInsensitiveParams.containsKey(DeltaDataSource.CDC_END_VERSION_KEY)) { + cdcOptions(DeltaDataSource.CDC_END_VERSION_KEY) = + caseInsensitiveParams.get(DeltaDataSource.CDC_END_VERSION_KEY) + } + if (caseInsensitiveParams.containsKey(DeltaDataSource.CDC_END_TIMESTAMP_KEY)) { + cdcOptions(DeltaDataSource.CDC_END_TIMESTAMP_KEY) = + caseInsensitiveParams.get(DeltaDataSource.CDC_END_TIMESTAMP_KEY) + } + } + val dfOptions: Map[String, String] = + if ( + sqlContext.sparkSession.sessionState.conf.getConf( + DeltaSQLConf.LOAD_FILE_SYSTEM_CONFIGS_FROM_DATAFRAME_OPTIONS) + ) { + parameters ++ cdcOptions + } else { + cdcOptions.toMap + } + (new ClickHouseTableV2( + sqlContext.sparkSession, + new Path(maybePath), + timeTravelOpt = timeTravelByParams, + options = dfOptions + )).toBaseRelation + } + } +} diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseSparkCatalog.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseSparkCatalog.scala new file mode 100644 index 0000000000000..dde7013962d0c --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseSparkCatalog.scala @@ -0,0 +1,734 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.execution.datasources.v2.clickhouse + +import org.apache.gluten.sql.shims.SparkShimLoader + +import org.apache.spark.SparkException +import org.apache.spark.sql.{AnalysisException, DataFrame, SparkSession} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchNamespaceException, NoSuchTableException} +import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.catalog._ +import org.apache.spark.sql.connector.catalog.TableCapability.V1_BATCH_WRITE +import org.apache.spark.sql.connector.expressions.{FieldReference, IdentityTransform, Transform} +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, V1Write, WriteBuilder} +import org.apache.spark.sql.delta.{DeltaConfigs, DeltaErrors, DeltaLog, DeltaOptions, DeltaTableUtils} +import org.apache.spark.sql.delta.DeltaTableIdentifier.gluePermissionError +import org.apache.spark.sql.delta.catalog.{BucketTransform, ClickHouseTableV2, DeltaTableV2, TempClickHouseTableV2} +import org.apache.spark.sql.delta.commands.{CreateDeltaTableCommand, TableCreationModes, WriteIntoDelta} +import org.apache.spark.sql.delta.metering.DeltaLogging +import org.apache.spark.sql.delta.skipping.clustering.ClusteredTableUtils +import org.apache.spark.sql.delta.skipping.clustering.temp.{ClusterBySpec, ClusterByTransform => TempClusterByTransform} +import org.apache.spark.sql.delta.sources.{DeltaSourceUtils, DeltaSQLConf} +import org.apache.spark.sql.delta.stats.StatisticsCollection +import org.apache.spark.sql.execution.datasources.{DataSource, PartitioningUtils} +import org.apache.spark.sql.execution.datasources.v2.clickhouse.utils.CHDataSourceUtils +import org.apache.spark.sql.sources.InsertableRelation +import org.apache.spark.sql.types.StructType + +import org.apache.hadoop.fs.Path + +import java.util +import java.util.Locale + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +class ClickHouseSparkCatalog + extends DelegatingCatalogExtension + with StagingTableCatalog + with SupportsPathIdentifier + with DeltaLogging { + + val spark = SparkSession.active + + private def createCatalogTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String] + ): Table = { + super.createTable(ident, schema, partitions, properties) + } + + override def createTable( + ident: Identifier, + columns: Array[org.apache.spark.sql.connector.catalog.Column], + partitions: Array[Transform], + properties: util.Map[String, String]): Table = { + createTable( + ident, + org.apache.spark.sql.connector.catalog.CatalogV2Util.v2ColumnsToStructType(columns), + partitions, + properties) + } + + override def createTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): Table = { + if (CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties))) { + createClickHouseTable( + ident, + schema, + partitions, + properties, + Map.empty, + sourceQuery = None, + TableCreationModes.Create) + } else if (DeltaSourceUtils.isDeltaDataSourceName(getProvider(properties))) { + createDeltaTable( + ident, + schema, + partitions, + properties, + Map.empty, + sourceQuery = None, + TableCreationModes.Create + ) + } else { + createCatalogTable(ident, schema, partitions, properties) + } + } + + /** + * Creates a ClickHouse table + * + * @param ident + * The 
identifier of the table + * @param schema + * The schema of the table + * @param partitions + * The partition transforms for the table + * @param allTableProperties + * The table properties that configure the behavior of the table or provide information about + * the table + * @param writeOptions + * Options specific to the write during table creation or replacement + * @param sourceQuery + * A query if this CREATE request came from a CTAS or RTAS + * @param operation + * The specific table creation mode, whether this is a Create/Replace/Create or Replace + */ + private def createClickHouseTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + allTableProperties: util.Map[String, String], + writeOptions: Map[String, String], + sourceQuery: Option[DataFrame], + operation: TableCreationModes.CreationMode): Table = { + val (partitionColumns, maybeBucketSpec) = + SparkShimLoader.getSparkShims.convertPartitionTransforms(partitions) + var newSchema = schema + var newPartitionColumns = partitionColumns + var newBucketSpec = maybeBucketSpec + + // Delta does not support bucket feature, so save the bucket infos into properties if exists. + val tableProperties = + ClickHouseConfig.createMergeTreeConfigurations(allTableProperties, newBucketSpec) + + val isByPath = isPathIdentifier(ident) + val location = if (isByPath) { + Option(ident.name()) + } else { + Option(allTableProperties.get("location")) + } + val locUriOpt = location.map(CatalogUtils.stringToURI) + val storage = DataSource + .buildStorageFormatFromOptions(writeOptions) + .copy(locationUri = locUriOpt) + val tableType = + if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED + val id = { + TableIdentifier(ident.name(), ident.namespace().lastOption) + } + val existingTableOpt = getExistingTableIfExists(id) + val loc = new Path(locUriOpt.getOrElse(spark.sessionState.catalog.defaultTablePath(id))) + val commentOpt = Option(allTableProperties.get("comment")) + + val tableDesc = new CatalogTable( + identifier = id, + tableType = tableType, + storage = storage, + schema = newSchema, + provider = Some(ClickHouseConfig.ALT_NAME), + partitionColumnNames = newPartitionColumns, + bucketSpec = newBucketSpec, + properties = tableProperties, + comment = commentOpt + ) + + val withDb = verifyTableAndSolidify(tableDesc, None, isMergeTree = true) + + val writer = sourceQuery.map { + df => + WriteIntoDelta( + DeltaLog.forTable(spark, loc), + operation.mode, + new DeltaOptions(withDb.storage.properties, spark.sessionState.conf), + withDb.partitionColumnNames, + withDb.properties ++ commentOpt.map("comment" -> _), + df, + schemaInCatalog = if (newSchema != schema) Some(newSchema) else None + ) + } + try { + ClickHouseTableV2.temporalThreadLocalCHTable.set( + new TempClickHouseTableV2(spark, Some(withDb))) + + CreateDeltaTableCommand( + withDb, + existingTableOpt, + operation.mode, + writer, + operation = operation, + tableByPath = isByPath).run(spark) + } finally { + ClickHouseTableV2.temporalThreadLocalCHTable.remove() + } + + logInfo(s"create table ${ident.toString} successfully.") + loadTable(ident) + } + + /** + * Creates a Delta table + * + * @param ident + * The identifier of the table + * @param schema + * The schema of the table + * @param partitions + * The partition transforms for the table + * @param allTableProperties + * The table properties that configure the behavior of the table or provide information about + * the table + * @param writeOptions + * Options specific to the write during table 
creation or replacement + * @param sourceQuery + * A query if this CREATE request came from a CTAS or RTAS + * @param operation + * The specific table creation mode, whether this is a Create/Replace/Create or Replace + */ + private def createDeltaTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + allTableProperties: util.Map[String, String], + writeOptions: Map[String, String], + sourceQuery: Option[DataFrame], + operation: TableCreationModes.CreationMode + ): Table = { + // These two keys are tableProperties in data source v2 but not in v1, so we have to filter + // them out. Otherwise property consistency checks will fail. + val tableProperties = allTableProperties.asScala.filterKeys { + case TableCatalog.PROP_LOCATION => false + case TableCatalog.PROP_PROVIDER => false + case TableCatalog.PROP_COMMENT => false + case TableCatalog.PROP_OWNER => false + case TableCatalog.PROP_EXTERNAL => false + case "path" => false + case "option.path" => false + case _ => true + }.toMap + val (partitionColumns, maybeBucketSpec, maybeClusterBySpec) = convertTransforms(partitions) + var newSchema = schema + var newPartitionColumns = partitionColumns + var newBucketSpec = maybeBucketSpec + val conf = spark.sessionState.conf + allTableProperties.asScala + .get(DeltaConfigs.DATA_SKIPPING_STATS_COLUMNS.key) + .foreach(StatisticsCollection.validateDeltaStatsColumns(schema, partitionColumns, _)) + val isByPath = isPathIdentifier(ident) + if ( + isByPath && !conf.getConf(DeltaSQLConf.DELTA_LEGACY_ALLOW_AMBIGUOUS_PATHS) + && allTableProperties.containsKey("location") + // The location property can be qualified and different from the path in the identifier, so + // we check `endsWith` here. + && Option(allTableProperties.get("location")).exists(!_.endsWith(ident.name())) + ) { + throw DeltaErrors.ambiguousPathsInCreateTableException( + ident.name(), + allTableProperties.get("location")) + } + val location = if (isByPath) { + Option(ident.name()) + } else { + Option(allTableProperties.get("location")) + } + val id = { + TableIdentifier(ident.name(), ident.namespace().lastOption) + } + var locUriOpt = location.map(CatalogUtils.stringToURI) + val existingTableOpt = getExistingTableIfExists(id) + val loc = locUriOpt + .orElse(existingTableOpt.flatMap(_.storage.locationUri)) + .getOrElse(spark.sessionState.catalog.defaultTablePath(id)) + val storage = DataSource + .buildStorageFormatFromOptions(writeOptions) + .copy(locationUri = Option(loc)) + val tableType = + if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED + val commentOpt = Option(allTableProperties.get("comment")) + + var tableDesc = new CatalogTable( + identifier = id, + tableType = tableType, + storage = storage, + schema = newSchema, + provider = Some(DeltaSourceUtils.ALT_NAME), + partitionColumnNames = newPartitionColumns, + bucketSpec = newBucketSpec, + properties = tableProperties, + comment = commentOpt + ) + + val withDb = + verifyTableAndSolidify( + tableDesc, + None, + maybeClusterBySpec + ) + + val writer = sourceQuery.map { + df => + WriteIntoDelta( + DeltaLog.forTable(spark, new Path(loc)), + operation.mode, + new DeltaOptions(withDb.storage.properties, spark.sessionState.conf), + withDb.partitionColumnNames, + withDb.properties ++ commentOpt.map("comment" -> _), + df, + Some(tableDesc), + schemaInCatalog = if (newSchema != schema) Some(newSchema) else None + ) + } + + CreateDeltaTableCommand( + withDb, + existingTableOpt, + operation.mode, + writer, + operation, + tableByPath = 
isByPath).run(spark) + + loadTable(ident) + } + + /** Performs checks on the parameters provided for table creation for a ClickHouse table. */ + private def verifyTableAndSolidify( + tableDesc: CatalogTable, + query: Option[LogicalPlan], + maybeClusterBySpec: Option[ClusterBySpec] = None, + isMergeTree: Boolean = false): CatalogTable = { + + if (!isMergeTree && tableDesc.bucketSpec.isDefined) { + throw DeltaErrors.operationNotSupportedException("Bucketing", tableDesc.identifier) + } + + val schema = query + .map { + plan => + assert(tableDesc.schema.isEmpty, "Can't specify table schema in CTAS.") + plan.schema.asNullable + } + .getOrElse(tableDesc.schema) + + PartitioningUtils.validatePartitionColumn( + schema, + tableDesc.partitionColumnNames, + caseSensitive = false + ) // Delta is case insensitive + + var validatedConfigurations = if (isMergeTree) { + tableDesc.properties + } else { + DeltaConfigs.validateConfigurations(tableDesc.properties) + } + + ClusteredTableUtils.validateExistingTableFeatureProperties(validatedConfigurations) + + // Add needed configs for Clustered table. + if (maybeClusterBySpec.nonEmpty) { + validatedConfigurations = validatedConfigurations ++ + ClusteredTableUtils.getClusteringColumnsAsProperty(maybeClusterBySpec) ++ + ClusteredTableUtils.getTableFeatureProperties(validatedConfigurations) + } + + val db = tableDesc.identifier.database.getOrElse(catalog.getCurrentDatabase) + val tableIdentWithDB = tableDesc.identifier.copy(database = Some(db)) + tableDesc.copy( + identifier = tableIdentWithDB, + schema = schema, + properties = validatedConfigurations) + } + + /** Checks if a table already exists for the provided identifier. */ + def getExistingTableIfExists(table: TableIdentifier): Option[CatalogTable] = { + // If this is a path identifier, we cannot return an existing CatalogTable. The Create command + // will check the file system itself + if (isPathIdentifier(table)) return None + val tableExists = catalog.tableExists(table) + if (tableExists) { + val oldTable = catalog.getTableMetadata(table) + if (oldTable.tableType == CatalogTableType.VIEW) { + throw new AnalysisException(s"$table is a view. You may not write data into a view.") + } + if ( + !DeltaSourceUtils.isDeltaTable(oldTable.provider) && + !CHDataSourceUtils.isClickHouseTable(oldTable.provider) + ) { + throw DeltaErrors.notADeltaTable(table.table) + } + Some(oldTable) + } else { + None + } + } + + private def getProvider(properties: util.Map[String, String]): String = { + Option(properties.get("provider")).getOrElse(ClickHouseConfig.NAME) + } + + override def loadTable(ident: Identifier): Table = { + try { + super.loadTable(ident) match { + case v1: V1Table if CHDataSourceUtils.isClickHouseTable(v1.catalogTable) => + new ClickHouseTableV2( + spark, + new Path(v1.catalogTable.location), + catalogTable = Some(v1.catalogTable), + tableIdentifier = Some(ident.toString)) + case v1: V1Table if DeltaTableUtils.isDeltaTable(v1.catalogTable) => + DeltaTableV2( + spark, + new Path(v1.catalogTable.location), + catalogTable = Some(v1.catalogTable), + tableIdentifier = Some(ident.toString)) + case o => + o + } + } catch { + case _: NoSuchDatabaseException | _: NoSuchNamespaceException | _: NoSuchTableException + if isPathIdentifier(ident) => + newDeltaPathTable(ident) + case e: AnalysisException if gluePermissionError(e) && isPathIdentifier(ident) => + logWarning( + "Received an access denied error from Glue. 
Assuming this " + + s"identifier ($ident) is path based.", + e) + newDeltaPathTable(ident) + } + } + + private def newDeltaPathTable(ident: Identifier): DeltaTableV2 = { + if (hasClickHouseNamespace(ident)) { + new ClickHouseTableV2(spark, new Path(ident.name())) + } else { + DeltaTableV2(spark, new Path(ident.name())) + } + } + + /** support to delete mergetree data from the external table */ + override def purgeTable(ident: Identifier): Boolean = { + try { + loadTable(ident) match { + case t: ClickHouseTableV2 => + val tableType = t.properties().getOrDefault("Type", "") + // file-based or external table + val isExternal = tableType.isEmpty || tableType.equalsIgnoreCase("external") + val tablePath = t.rootPath + // first delete the table metadata + val deletedTable = super.dropTable(ident) + if (deletedTable && isExternal) { + val fs = tablePath.getFileSystem(spark.sessionState.newHadoopConf()) + // delete all data if there is a external table + fs.delete(tablePath, true) + } + true + case _ => super.purgeTable(ident) + } + } catch { + case _: Exception => + false + } + } + + override def stageReplace( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): StagedTable = + recordFrameProfile("DeltaCatalog", "stageReplace") { + if ( + CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties)) || + DeltaSourceUtils.isDeltaDataSourceName(getProvider(properties)) + ) { + new StagedDeltaTableV2(ident, schema, partitions, properties, TableCreationModes.Replace) + } else { + super.dropTable(ident) + val table = createCatalogTable(ident, schema, partitions, properties) + BestEffortStagedTable(ident, table, this) + } + } + + override def stageCreateOrReplace( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): StagedTable = + recordFrameProfile("DeltaCatalog", "stageCreateOrReplace") { + if ( + CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties)) || + DeltaSourceUtils.isDeltaDataSourceName(getProvider(properties)) + ) { + new StagedDeltaTableV2( + ident, + schema, + partitions, + properties, + TableCreationModes.CreateOrReplace) + } else { + try super.dropTable(ident) + catch { + case _: NoSuchDatabaseException => // this is fine + case _: NoSuchTableException => // this is fine + } + val table = createCatalogTable(ident, schema, partitions, properties) + BestEffortStagedTable(ident, table, this) + } + } + + override def stageCreate( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): StagedTable = + recordFrameProfile("DeltaCatalog", "stageCreate") { + if ( + CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties)) || + DeltaSourceUtils.isDeltaDataSourceName(getProvider(properties)) + ) { + new StagedDeltaTableV2(ident, schema, partitions, properties, TableCreationModes.Create) + } else { + val table = createCatalogTable(ident, schema, partitions, properties) + BestEffortStagedTable(ident, table, this) + } + } + + // Copy of V2SessionCatalog.convertTransforms, which is private. 
+ private def convertTransforms( + partitions: Seq[Transform]): (Seq[String], Option[BucketSpec], Option[ClusterBySpec]) = { + val identityCols = new mutable.ArrayBuffer[String] + var bucketSpec = Option.empty[BucketSpec] + var clusterBySpec = Option.empty[ClusterBySpec] + + partitions.map { + case IdentityTransform(FieldReference(Seq(col))) => + identityCols += col + + case BucketTransform(numBuckets, bucketCols, sortCols) => + bucketSpec = Some( + BucketSpec( + numBuckets, + bucketCols.map(_.fieldNames.head), + sortCols.map(_.fieldNames.head))) + case TempClusterByTransform(columnNames) => + if (clusterBySpec.nonEmpty) { + // Parser guarantees that it only passes down one TempClusterByTransform. + throw SparkException.internalError("Cannot have multiple cluster by transforms.") + } + clusterBySpec = Some(ClusterBySpec(columnNames)) + + case transform => + throw DeltaErrors.operationNotSupportedException(s"Partitioning by expressions") + } + // Parser guarantees that partition and cluster by can't both exist. + assert(!(identityCols.toSeq.nonEmpty && clusterBySpec.nonEmpty)) + // Parser guarantees that bucketing and cluster by can't both exist. + assert(!(bucketSpec.nonEmpty && clusterBySpec.nonEmpty)) + + (identityCols.toSeq, bucketSpec, clusterBySpec) + } + + /** + * A staged delta table, which creates a HiveMetaStore entry and appends data if this was a + * CTAS/RTAS command. We have an ugly way of using this API right now, but it's the best way to + * maintain old behavior compatibility between Databricks Runtime and OSS Delta Lake. + */ + private class StagedDeltaTableV2( + ident: Identifier, + override val schema: StructType, + val partitions: Array[Transform], + override val properties: util.Map[String, String], + operation: TableCreationModes.CreationMode) + extends StagedTable + with SupportsWrite { + + private var asSelectQuery: Option[DataFrame] = None + private var writeOptions: Map[String, String] = Map.empty + + override def commitStagedChanges(): Unit = + recordFrameProfile("DeltaCatalog", "commitStagedChanges") { + val conf = spark.sessionState.conf + val props = new util.HashMap[String, String]() + // Options passed in through the SQL API will show up both with an "option." 
prefix and + // without in Spark 3.1, so we need to remove those from the properties + val optionsThroughProperties = properties.asScala.collect { + case (k, _) if k.startsWith("option.") => k.stripPrefix("option.") + }.toSet + val sqlWriteOptions = new util.HashMap[String, String]() + properties.asScala.foreach { + case (k, v) => + if (!k.startsWith("option.") && !optionsThroughProperties.contains(k)) { + // Do not add to properties + props.put(k, v) + } else if (optionsThroughProperties.contains(k)) { + sqlWriteOptions.put(k, v) + } + } + if (writeOptions.isEmpty && !sqlWriteOptions.isEmpty) { + writeOptions = sqlWriteOptions.asScala.toMap + } + if (conf.getConf(DeltaSQLConf.DELTA_LEGACY_STORE_WRITER_OPTIONS_AS_PROPS)) { + // Legacy behavior + writeOptions.foreach { case (k, v) => props.put(k, v) } + } else { + writeOptions.foreach { + case (k, v) => + // Continue putting in Delta prefixed options to avoid breaking workloads + if (k.toLowerCase(Locale.ROOT).startsWith("delta.")) { + props.put(k, v) + } + } + } + if (CHDataSourceUtils.isClickHouseDataSourceName(getProvider(properties))) { + createClickHouseTable( + ident, + schema, + partitions, + props, + writeOptions, + asSelectQuery, + operation) + } else { + createDeltaTable(ident, schema, partitions, props, writeOptions, asSelectQuery, operation) + } + } + + override def name(): String = ident.name() + + override def abortStagedChanges(): Unit = {} + + override def capabilities(): util.Set[TableCapability] = Set(V1_BATCH_WRITE).asJava + + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { + writeOptions = info.options.asCaseSensitiveMap().asScala.toMap + new DeltaV1WriteBuilder + } + + /* + * WriteBuilder for creating a Delta table. + */ + private class DeltaV1WriteBuilder extends WriteBuilder { + override def build(): V1Write = new V1Write { + override def toInsertableRelation(): InsertableRelation = { + new InsertableRelation { + override def insert(data: DataFrame, overwrite: Boolean): Unit = { + asSelectQuery = Option(data) + } + } + } + } + } + } + + private case class BestEffortStagedTable(ident: Identifier, table: Table, catalog: TableCatalog) + extends StagedTable + with SupportsWrite { + override def abortStagedChanges(): Unit = catalog.dropTable(ident) + + override def commitStagedChanges(): Unit = {} + + // Pass through + override def name(): String = table.name() + + @deprecated + override def schema(): StructType = table.schema() + override def partitioning(): Array[Transform] = table.partitioning() + override def capabilities(): util.Set[TableCapability] = table.capabilities() + override def properties(): util.Map[String, String] = table.properties() + + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = table match { + case supportsWrite: SupportsWrite => supportsWrite.newWriteBuilder(info) + case _ => throw DeltaErrors.unsupportedWriteStagedTable(name) + } + } +} + +/** + * A trait for handling table access through clickhouse.`/some/path`. This is a stop-gap solution + * until PathIdentifiers are implemented in Apache Spark. 
+ */ +trait SupportsPathIdentifier extends TableCatalog { + self: ClickHouseSparkCatalog => + + protected lazy val catalog: SessionCatalog = spark.sessionState.catalog + + override def tableExists(ident: Identifier): Boolean = { + if (isPathIdentifier(ident)) { + val path = new Path(ident.name()) + val fs = path.getFileSystem(spark.sessionState.newHadoopConf()) + fs.exists(path) && fs.listStatus(path).nonEmpty + } else { + super.tableExists(ident) + } + } + + protected def isPathIdentifier(ident: Identifier): Boolean = { + // Should be a simple check of a special PathIdentifier class in the future + try { + supportSQLOnFile && (hasClickHouseNamespace(ident) || hasDeltaNamespace(ident)) && + new Path(ident.name()).isAbsolute + } catch { + case _: IllegalArgumentException => false + } + } + + protected def isPathIdentifier(table: CatalogTable): Boolean = { + isPathIdentifier(table.identifier) + } + + protected def isPathIdentifier(tableIdentifier: TableIdentifier): Boolean = { + isPathIdentifier(Identifier.of(tableIdentifier.database.toArray, tableIdentifier.table)) + } + + private def supportSQLOnFile: Boolean = spark.sessionState.conf.runSQLonFile + + protected def hasClickHouseNamespace(ident: Identifier): Boolean = { + ident.namespace().length == 1 && + CHDataSourceUtils.isClickHouseDataSourceName(ident.namespace().head) + } + + protected def hasDeltaNamespace(ident: Identifier): Boolean = { + ident.namespace().length == 1 && DeltaSourceUtils.isDeltaDataSourceName(ident.namespace().head) + } +} diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/execution/datasources/v2/clickhouse/source/DeltaMergeTreeFileFormat.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/execution/datasources/v2/clickhouse/source/DeltaMergeTreeFileFormat.scala new file mode 100644 index 0000000000000..2f71a0a0ef7b3 --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/execution/datasources/v2/clickhouse/source/DeltaMergeTreeFileFormat.scala @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.execution.datasources.v2.clickhouse.source + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.delta.DeltaParquetFileFormat +import org.apache.spark.sql.delta.actions.{Metadata, Protocol} +import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory} +import org.apache.spark.sql.execution.datasources.v1.GlutenMergeTreeWriterInjects +import org.apache.spark.sql.types.StructType + +import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} + +@SuppressWarnings(Array("io.github.zhztheplayer.scalawarts.InheritFromCaseClass")) +class DeltaMergeTreeFileFormat(protocol: Protocol, metadata: Metadata) + extends DeltaParquetFileFormat(protocol, metadata) { + + protected var database = "" + protected var tableName = "" + protected var snapshotId = "" + protected var orderByKeyOption: Option[Seq[String]] = None + protected var lowCardKeyOption: Option[Seq[String]] = None + protected var minmaxIndexKeyOption: Option[Seq[String]] = None + protected var bfIndexKeyOption: Option[Seq[String]] = None + protected var setIndexKeyOption: Option[Seq[String]] = None + protected var primaryKeyOption: Option[Seq[String]] = None + protected var partitionColumns: Seq[String] = Seq.empty[String] + protected var clickhouseTableConfigs: Map[String, String] = Map.empty + + // scalastyle:off argcount + def this( + protocol: Protocol, + metadata: Metadata, + database: String, + tableName: String, + snapshotId: String, + orderByKeyOption: Option[Seq[String]], + lowCardKeyOption: Option[Seq[String]], + minmaxIndexKeyOption: Option[Seq[String]], + bfIndexKeyOption: Option[Seq[String]], + setIndexKeyOption: Option[Seq[String]], + primaryKeyOption: Option[Seq[String]], + clickhouseTableConfigs: Map[String, String], + partitionColumns: Seq[String]) = { + this(protocol, metadata) + this.database = database + this.tableName = tableName + this.snapshotId = snapshotId + this.orderByKeyOption = orderByKeyOption + this.lowCardKeyOption = lowCardKeyOption + this.minmaxIndexKeyOption = minmaxIndexKeyOption + this.bfIndexKeyOption = bfIndexKeyOption + this.setIndexKeyOption = setIndexKeyOption + this.primaryKeyOption = primaryKeyOption + this.clickhouseTableConfigs = clickhouseTableConfigs + this.partitionColumns = partitionColumns + } + // scalastyle:on argcount + + override def shortName(): String = "mergetree" + + override def toString(): String = "MergeTree" + + override def equals(other: Any): Boolean = { + other match { + case ff: DeltaMergeTreeFileFormat => + ff.columnMappingMode == columnMappingMode && + ff.referenceSchema == referenceSchema && + ff.optimizationsEnabled == optimizationsEnabled + case _ => false + } + } + + override def hashCode(): Int = getClass.getCanonicalName.hashCode() + + override def prepareWrite( + sparkSession: SparkSession, + job: Job, + options: Map[String, String], + dataSchema: StructType): OutputWriterFactory = { + // pass compression to job conf so that the file extension can be aware of it. 
+ // val conf = ContextUtil.getConfiguration(job) + val nativeConf = + GlutenMergeTreeWriterInjects + .getInstance() + .nativeConf(options, "") + + new OutputWriterFactory { + override def getFileExtension(context: TaskAttemptContext): String = { + ".mergetree" + } + + override def newInstance( + path: String, + dataSchema: StructType, + context: TaskAttemptContext): OutputWriter = { + GlutenMergeTreeWriterInjects + .getInstance() + .createOutputWriter( + path, + database, + tableName, + snapshotId, + orderByKeyOption, + lowCardKeyOption, + minmaxIndexKeyOption, + bfIndexKeyOption, + setIndexKeyOption, + primaryKeyOption, + partitionColumns, + metadata.schema, + clickhouseTableConfigs, + context, + nativeConf + ) + } + } + } +} diff --git a/backends-clickhouse/src/main/java/org/apache/gluten/memory/alloc/CHNativeMemoryAllocators.java b/backends-clickhouse/src/main/java/org/apache/gluten/memory/alloc/CHNativeMemoryAllocators.java index 01fb7e3e2f7cf..0f30972fcd44d 100644 --- a/backends-clickhouse/src/main/java/org/apache/gluten/memory/alloc/CHNativeMemoryAllocators.java +++ b/backends-clickhouse/src/main/java/org/apache/gluten/memory/alloc/CHNativeMemoryAllocators.java @@ -19,13 +19,12 @@ import org.apache.gluten.memory.SimpleMemoryUsageRecorder; import org.apache.gluten.memory.memtarget.MemoryTargets; import org.apache.gluten.memory.memtarget.Spiller; +import org.apache.gluten.memory.memtarget.Spillers; import org.apache.spark.memory.TaskMemoryManager; import org.apache.spark.util.TaskResources; -import java.util.Arrays; import java.util.Collections; -import java.util.List; /** * Built-in toolkit for managing native memory allocations. To use the facility, one should import @@ -46,12 +45,12 @@ private CHNativeMemoryAllocators() {} private static CHNativeMemoryAllocatorManager createNativeMemoryAllocatorManager( String name, TaskMemoryManager taskMemoryManager, - List spillers, + Spiller spiller, SimpleMemoryUsageRecorder usage) { CHManagedCHReservationListener rl = new CHManagedCHReservationListener( - MemoryTargets.newConsumer(taskMemoryManager, name, spillers, Collections.emptyMap()), + MemoryTargets.newConsumer(taskMemoryManager, name, spiller, Collections.emptyMap()), usage); return new CHNativeMemoryAllocatorManagerImpl(CHNativeMemoryAllocator.createListenable(rl)); } @@ -67,7 +66,7 @@ public static CHNativeMemoryAllocator contextInstance() { createNativeMemoryAllocatorManager( "ContextInstance", TaskResources.getLocalTaskContext().taskMemoryManager(), - Collections.emptyList(), + Spillers.NOOP, TaskResources.getSharedUsage()); TaskResources.addResource(id, manager); } @@ -78,7 +77,7 @@ public static CHNativeMemoryAllocator contextInstanceForUT() { return CHNativeMemoryAllocator.getDefaultForUT(); } - public static CHNativeMemoryAllocator createSpillable(String name, Spiller... spillers) { + public static CHNativeMemoryAllocator createSpillable(String name, Spiller spiller) { if (!TaskResources.inSparkTask()) { throw new IllegalStateException("spiller must be used in a Spark task"); } @@ -87,7 +86,7 @@ public static CHNativeMemoryAllocator createSpillable(String name, Spiller... 
sp createNativeMemoryAllocatorManager( name, TaskResources.getLocalTaskContext().taskMemoryManager(), - Arrays.asList(spillers), + spiller, TaskResources.getSharedUsage()); TaskResources.addAnonymousResource(manager); // force add memory consumer to task memory manager, will release by inactivate diff --git a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/BatchIterator.java b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/BatchIterator.java index 5698caf026370..1fbb6053a2afc 100644 --- a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/BatchIterator.java +++ b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/BatchIterator.java @@ -17,15 +17,18 @@ package org.apache.gluten.vectorized; import org.apache.gluten.metrics.IMetrics; +import org.apache.gluten.metrics.NativeMetrics; import org.apache.spark.sql.execution.utils.CHExecUtil; import org.apache.spark.sql.vectorized.ColumnVector; import org.apache.spark.sql.vectorized.ColumnarBatch; import java.io.IOException; +import java.util.concurrent.atomic.AtomicBoolean; public class BatchIterator extends GeneralOutIterator { private final long handle; + private final AtomicBoolean cancelled = new AtomicBoolean(false); public BatchIterator(long handle) { super(); @@ -46,7 +49,9 @@ public String getId() { private native void nativeClose(long nativeHandle); - private native IMetrics nativeFetchMetrics(long nativeHandle); + private native void nativeCancel(long nativeHandle); + + private native String nativeFetchMetrics(long nativeHandle); @Override public boolean hasNextInternal() throws IOException { @@ -68,12 +73,19 @@ public ColumnarBatch nextInternal() throws IOException { } @Override - public IMetrics getMetricsInternal() throws IOException, ClassNotFoundException { - return nativeFetchMetrics(handle); + public IMetrics getMetricsInternal() { + return new NativeMetrics(nativeFetchMetrics(handle)); } @Override public void closeInternal() { nativeClose(handle); } + + // Used to cancel native pipeline execution when spark task is killed + public final void cancel() { + if (cancelled.compareAndSet(false, true)) { + nativeCancel(handle); + } + } } diff --git a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHNativeExpressionEvaluator.java b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHNativeExpressionEvaluator.java index 0d307d23102dc..b8b4138dc8c05 100644 --- a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHNativeExpressionEvaluator.java +++ b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHNativeExpressionEvaluator.java @@ -83,7 +83,7 @@ private Map getNativeBackendConf() { // Used by WholeStageTransform to create the native computing pipeline and // return a columnar result iterator. - public GeneralOutIterator createKernelWithBatchIterator( + public BatchIterator createKernelWithBatchIterator( byte[] wsPlan, byte[][] splitInfo, List iterList, @@ -97,11 +97,11 @@ public GeneralOutIterator createKernelWithBatchIterator( iterList.toArray(new GeneralInIterator[0]), buildNativeConf(getNativeBackendConf()), materializeInput); - return createOutIterator(handle); + return createBatchIterator(handle); } // Only for UT. 
- public GeneralOutIterator createKernelWithBatchIterator( + public BatchIterator createKernelWithBatchIterator( long allocId, byte[] wsPlan, byte[][] splitInfo, List iterList) { long handle = jniWrapper.nativeCreateKernelWithIterator( @@ -111,10 +111,10 @@ public GeneralOutIterator createKernelWithBatchIterator( iterList.toArray(new GeneralInIterator[0]), buildNativeConf(getNativeBackendConf()), false); - return createOutIterator(handle); + return createBatchIterator(handle); } - private GeneralOutIterator createOutIterator(long nativeHandle) { + private BatchIterator createBatchIterator(long nativeHandle) { return new BatchIterator(nativeHandle); } } diff --git a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHShuffleSplitterJniWrapper.java b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHShuffleSplitterJniWrapper.java index f81ec88c254e7..815bf472c027a 100644 --- a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHShuffleSplitterJniWrapper.java +++ b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHShuffleSplitterJniWrapper.java @@ -37,7 +37,8 @@ public long make( boolean flushBlockBufferBeforeEvict, long maxSortBufferSize, boolean spillFirstlyBeforeStop, - boolean forceSort) { + boolean forceExternalSort, + boolean forceMemorySort) { return nativeMake( part.getShortName(), part.getNumPartitions(), @@ -57,7 +58,8 @@ public long make( flushBlockBufferBeforeEvict, maxSortBufferSize, spillFirstlyBeforeStop, - forceSort); + forceExternalSort, + forceMemorySort); } public long makeForRSS( @@ -70,7 +72,9 @@ public long makeForRSS( String hashAlgorithm, Object pusher, boolean throwIfMemoryExceed, - boolean flushBlockBufferBeforeEvict) { + boolean flushBlockBufferBeforeEvict, + boolean forceExternalSort, + boolean forceMemorySort) { return nativeMakeForRSS( part.getShortName(), part.getNumPartitions(), @@ -84,7 +88,9 @@ public long makeForRSS( hashAlgorithm, pusher, throwIfMemoryExceed, - flushBlockBufferBeforeEvict); + flushBlockBufferBeforeEvict, + forceExternalSort, + forceMemorySort); } public native long nativeMake( @@ -106,7 +112,8 @@ public native long nativeMake( boolean flushBlockBufferBeforeEvict, long maxSortBufferSize, boolean spillFirstlyBeforeStop, - boolean forceSort); + boolean forceSort, + boolean forceMemorySort); public native long nativeMakeForRSS( String shortName, @@ -121,7 +128,9 @@ public native long nativeMakeForRSS( String hashAlgorithm, Object pusher, boolean throwIfMemoryExceed, - boolean flushBlockBufferBeforeEvict); + boolean flushBlockBufferBeforeEvict, + boolean forceSort, + boolean forceMemorySort); public native void split(long splitterId, long block); diff --git a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/StorageJoinBuilder.java b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/StorageJoinBuilder.java index 065be9de2557f..9cb49b6a2d30e 100644 --- a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/StorageJoinBuilder.java +++ b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/StorageJoinBuilder.java @@ -44,6 +44,7 @@ private static native long nativeBuild( long rowCount, String joinKeys, int joinType, + boolean hasMixedFiltCondition, byte[] namedStruct); private StorageJoinBuilder() {} @@ -79,6 +80,7 @@ public static long build( rowCount, joinKey, SubstraitUtil.toSubstrait(broadCastContext.joinType()).ordinal(), + broadCastContext.hasMixedFiltCondition(), toNameStruct(output).toByteArray()); } diff --git 
a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHBackend.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHBackend.scala index bc0c8d1c07f20..cdca1b031a915 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHBackend.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHBackend.scala @@ -26,7 +26,7 @@ import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat._ import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.catalog.BucketSpec -import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, DenseRank, Lag, Lead, NamedExpression, Rank, RowNumber} +import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, DenseRank, Expression, Lag, Lead, Literal, NamedExpression, Rank, RowNumber} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning} import org.apache.spark.sql.execution.SparkPlan @@ -49,7 +49,6 @@ class CHBackend extends Backend { override def validatorApi(): ValidatorApi = new CHValidatorApi override def metricsApi(): MetricsApi = new CHMetricsApi override def listenerApi(): ListenerApi = new CHListenerApi - override def broadcastApi(): BroadcastApi = new CHBroadcastApi override def settings(): BackendSettingsApi = CHBackendSettings } @@ -127,7 +126,7 @@ object CHBackendSettings extends BackendSettingsApi with Logging { val GLUTEN_MAX_SHUFFLE_READ_BYTES: String = GlutenConfig.GLUTEN_CONFIG_PREFIX + CHBackend.BACKEND_NAME + ".runtime_config.max_source_concatenate_bytes" - val GLUTEN_MAX_SHUFFLE_READ_BYTES_DEFAULT = -1 + val GLUTEN_MAX_SHUFFLE_READ_BYTES_DEFAULT = GLUTEN_MAX_BLOCK_SIZE_DEFAULT * 256 def affinityMode: String = { SparkEnv.get.conf @@ -190,7 +189,6 @@ object CHBackendSettings extends BackendSettingsApi with Logging { } } - override def utilizeShuffledHashJoinHint(): Boolean = true override def supportShuffleWithProject( outputPartitioning: Partitioning, child: SparkPlan): Boolean = { @@ -226,10 +224,25 @@ object CHBackendSettings extends BackendSettingsApi with Logging { func => { val aliasExpr = func.asInstanceOf[Alias] val wExpression = WindowFunctionsBuilder.extractWindowExpression(aliasExpr.child) + + def checkLagOrLead(third: Expression): Unit = { + third match { + case _: Literal => + allSupported = allSupported + case _ => + logInfo("Not support lag/lead function with default value not literal null") + allSupported = false + break + } + } + wExpression.windowFunction match { - case _: RowNumber | _: AggregateExpression | _: Rank | _: Lead | _: Lag | - _: DenseRank => + case _: RowNumber | _: AggregateExpression | _: Rank | _: DenseRank => allSupported = allSupported + case l: Lag => + checkLagOrLead(l.third) + case l: Lead => + checkLagOrLead(l.third) case _ => logDebug(s"Not support window function: ${wExpression.getClass}") allSupported = false diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHBroadcastApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHBroadcastApi.scala deleted file mode 100644 index d70ba6b8d2f13..0000000000000 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHBroadcastApi.scala +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * 
contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.gluten.backendsapi.clickhouse - -import org.apache.gluten.backendsapi.BroadcastApi -import org.apache.gluten.execution.CHBroadcastBuildSideCache - -import org.apache.spark.internal.Logging -import org.apache.spark.rpc.GlutenDriverEndpoint - -class CHBroadcastApi extends BroadcastApi with Logging { - override def cleanExecutionBroadcastTable( - executionId: String, - broadcastTableIds: java.util.Set[String]): Unit = { - if (broadcastTableIds != null) { - broadcastTableIds.forEach( - resource_id => CHBroadcastBuildSideCache.invalidateBroadcastHashtable(resource_id)) - } - } - - override def collectExecutionBroadcastTableId(executionId: String, buildTableId: String): Unit = { - if (executionId != null) { - GlutenDriverEndpoint.collectResources(executionId, buildTableId) - } else { - logWarning( - s"Can't not trace broadcast hash table data $buildTableId" + - s" because execution id is null." + - s" Will clean up until expire time.") - } - } -} diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHIteratorApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHIteratorApi.scala index 4926a97eb8249..4b9ec739028f6 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHIteratorApi.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHIteratorApi.scala @@ -16,21 +16,21 @@ */ package org.apache.gluten.backendsapi.clickhouse -import org.apache.gluten.{GlutenConfig, GlutenNumaBindingInfo} +import org.apache.gluten.GlutenNumaBindingInfo import org.apache.gluten.backendsapi.IteratorApi import org.apache.gluten.execution._ import org.apache.gluten.expression.ConverterUtils -import org.apache.gluten.metrics.{GlutenTimeMetric, IMetrics, NativeMetrics} +import org.apache.gluten.metrics.{IMetrics, NativeMetrics} import org.apache.gluten.substrait.plan.PlanNode import org.apache.gluten.substrait.rel._ import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat import org.apache.gluten.utils.LogLevelUtil -import org.apache.gluten.vectorized.{CHNativeExpressionEvaluator, CloseableCHColumnBatchIterator, GeneralInIterator, GeneralOutIterator} +import org.apache.gluten.vectorized.{BatchIterator, CHNativeExpressionEvaluator, CloseableCHColumnBatchIterator, GeneralInIterator} -import org.apache.spark.{InterruptibleIterator, SparkConf, SparkContext, TaskContext} +import org.apache.spark.{InterruptibleIterator, SparkConf, TaskContext} import org.apache.spark.affinity.CHAffinity +import org.apache.spark.executor.InputMetrics import org.apache.spark.internal.Logging -import org.apache.spark.rdd.RDD import org.apache.spark.sql.connector.read.InputPartition import org.apache.spark.sql.execution.datasources.FilePartition import 
org.apache.spark.sql.execution.metric.SQLMetric @@ -58,7 +58,53 @@ class CHIteratorApi extends IteratorApi with Logging with LogLevelUtil { } dataSchema += newField } - StructType(dataSchema) + StructType(dataSchema.toSeq) + } + + private def createNativeIterator( + splitInfoByteArray: Array[Array[Byte]], + wsPlan: Array[Byte], + materializeInput: Boolean, + inputIterators: Seq[Iterator[ColumnarBatch]]): BatchIterator = { + + /** Generate closeable ColumnBatch iterator. */ + val listIterator = + inputIterators + .map { + case i: CloseableCHColumnBatchIterator => i + case it => new CloseableCHColumnBatchIterator(it) + } + .map(it => new ColumnarNativeIterator(it.asJava).asInstanceOf[GeneralInIterator]) + .asJava + new CHNativeExpressionEvaluator().createKernelWithBatchIterator( + wsPlan, + splitInfoByteArray, + listIterator, + materializeInput + ) + } + + private def createCloseIterator( + context: TaskContext, + pipelineTime: SQLMetric, + updateNativeMetrics: IMetrics => Unit, + updateInputMetrics: Option[InputMetricsWrapper => Unit] = None, + nativeIter: BatchIterator): CloseableCHColumnBatchIterator = { + + val iter = new CollectMetricIterator( + nativeIter, + updateNativeMetrics, + updateInputMetrics, + updateInputMetrics.map(_ => context.taskMetrics().inputMetrics).orNull) + + context.addTaskFailureListener( + (ctx, _) => { + if (ctx.isInterrupted()) { + iter.cancel() + } + }) + context.addTaskCompletionListener[Unit](_ => iter.close()) + new CloseableCHColumnBatchIterator(iter, Some(pipelineTime)) } // only set file schema for text format table @@ -68,7 +114,7 @@ class CHIteratorApi extends IteratorApi with Logging with LogLevelUtil { if (scan.fileFormat == ReadFileFormat.TextReadFormat) { val names = ConverterUtils.collectAttributeNamesWithoutExprId(scan.outputAttributes()) - localFilesNode.setFileSchema(getFileSchema(scan.getDataSchema, names.asScala)) + localFilesNode.setFileSchema(getFileSchema(scan.getDataSchema, names.asScala.toSeq)) } } @@ -118,7 +164,7 @@ class CHIteratorApi extends IteratorApi with Logging with LogLevelUtil { val partitionColumns = new JArrayList[JMap[String, String]] f.files.foreach { file => - paths.add(new URI(file.filePath).toASCIIString) + paths.add(new URI(file.filePath.toString()).toASCIIString) starts.add(JLong.valueOf(file.start)) lengths.add(JLong.valueOf(file.length)) // TODO: Support custom partition location @@ -132,10 +178,13 @@ class CHIteratorApi extends IteratorApi with Logging with LogLevelUtil { paths, starts, lengths, + new JArrayList[JLong](), + new JArrayList[JLong](), partitionColumns, new JArrayList[JMap[String, String]](), fileFormat, - preferredLocations.toList.asJava) + preferredLocations.toList.asJava + ) case _ => throw new UnsupportedOperationException(s"Unsupported input partition: $partition.") } @@ -195,59 +244,24 @@ class CHIteratorApi extends IteratorApi with Logging with LogLevelUtil { inputIterators: Seq[Iterator[ColumnarBatch]] = Seq() ): Iterator[ColumnarBatch] = { - assert( + require( inputPartition.isInstanceOf[GlutenPartition], "CH backend only accepts GlutenPartition in GlutenWholeStageColumnarRDD.") - - val transKernel = new CHNativeExpressionEvaluator() - val inBatchIters = new JArrayList[GeneralInIterator](inputIterators.map { - iter => new ColumnarNativeIterator(CHIteratorApi.genCloseableColumnBatchIterator(iter).asJava) - }.asJava) - val splitInfoByteArray = inputPartition .asInstanceOf[GlutenPartition] .splitInfosByteArray - val resIter: GeneralOutIterator = - transKernel.createKernelWithBatchIterator( - 
inputPartition.plan, - splitInfoByteArray, - inBatchIters, - false) - - context.addTaskCompletionListener[Unit](_ => resIter.close()) - val iter = new Iterator[Any] { - private val inputMetrics = context.taskMetrics().inputMetrics - private var outputRowCount = 0L - private var outputVectorCount = 0L - private var metricsUpdated = false - - override def hasNext: Boolean = { - val res = resIter.hasNext - // avoid to collect native metrics more than once, 'hasNext' is a idempotent operation - if (!res && !metricsUpdated) { - val nativeMetrics = resIter.getMetrics.asInstanceOf[NativeMetrics] - nativeMetrics.setFinalOutputMetrics(outputRowCount, outputVectorCount) - updateNativeMetrics(nativeMetrics) - updateInputMetrics(inputMetrics) - metricsUpdated = true - } - res - } + val wsPlan = inputPartition.plan + val materializeInput = false - override def next(): Any = { - val cb = resIter.next() - outputVectorCount += 1 - outputRowCount += cb.numRows() - cb - } - } - - // TODO: SPARK-25083 remove the type erasure hack in data source scan new InterruptibleIterator( context, - new CloseableCHColumnBatchIterator( - iter.asInstanceOf[Iterator[ColumnarBatch]], - Some(pipelineTime))) + createCloseIterator( + context, + pipelineTime, + updateNativeMetrics, + Some(updateInputMetrics), + createNativeIterator(splitInfoByteArray, wsPlan, materializeInput, inputIterators)) + ) } // Generate Iterator[ColumnarBatch] for final stage. @@ -263,105 +277,59 @@ class CHIteratorApi extends IteratorApi with Logging with LogLevelUtil { partitionIndex: Int, materializeInput: Boolean): Iterator[ColumnarBatch] = { // scalastyle:on argcount - GlutenConfig.getConf - - val transKernel = new CHNativeExpressionEvaluator() - val columnarNativeIterator = - new JArrayList[GeneralInIterator](inputIterators.map { - iter => - new ColumnarNativeIterator(CHIteratorApi.genCloseableColumnBatchIterator(iter).asJava) - }.asJava) - // we need to complete dependency RDD's firstly - val nativeIterator = transKernel.createKernelWithBatchIterator( - rootNode.toProtobuf.toByteArray, - // Final iterator does not contain scan split, so pass empty split info to native here. - new Array[Array[Byte]](0), - columnarNativeIterator, - materializeInput - ) - val resIter = new Iterator[ColumnarBatch] { - private var outputRowCount = 0L - private var outputVectorCount = 0L - private var metricsUpdated = false + // Final iterator does not contain scan split, so pass empty split info to native here. 
+ val splitInfoByteArray = new Array[Array[Byte]](0) + val wsPlan = rootNode.toProtobuf.toByteArray - override def hasNext: Boolean = { - val res = nativeIterator.hasNext - // avoid to collect native metrics more than once, 'hasNext' is a idempotent operation - if (!res && !metricsUpdated) { - val nativeMetrics = nativeIterator.getMetrics.asInstanceOf[NativeMetrics] - nativeMetrics.setFinalOutputMetrics(outputRowCount, outputVectorCount) - updateNativeMetrics(nativeMetrics) - metricsUpdated = true - } - res - } - - override def next(): ColumnarBatch = { - val cb = nativeIterator.next() - outputVectorCount += 1 - outputRowCount += cb.numRows() - cb - } - } - var closed = false + // we need to complete dependency RDD's firstly + createCloseIterator( + context, + pipelineTime, + updateNativeMetrics, + None, + createNativeIterator(splitInfoByteArray, wsPlan, materializeInput, inputIterators)) + } +} - def close(): Unit = { - closed = true - nativeIterator.close() - // relationHolder.clear() - } +class CollectMetricIterator( + val nativeIterator: BatchIterator, + val updateNativeMetrics: IMetrics => Unit, + val updateInputMetrics: Option[InputMetricsWrapper => Unit] = None, + val inputMetrics: InputMetrics = null +) extends Iterator[ColumnarBatch] { + private var outputRowCount = 0L + private var outputVectorCount = 0L + private var metricsUpdated = false - context.addTaskCompletionListener[Unit](_ => close()) - new CloseableCHColumnBatchIterator(resIter, Some(pipelineTime)) + override def hasNext: Boolean = { + nativeIterator.hasNext } - /** Generate Native FileScanRDD, currently only for ClickHouse Backend. */ - override def genNativeFileScanRDD( - sparkContext: SparkContext, - wsCtx: WholeStageTransformContext, - splitInfos: Seq[SplitInfo], - scan: BasicScanExecTransformer, - numOutputRows: SQLMetric, - numOutputBatches: SQLMetric, - scanTime: SQLMetric): RDD[ColumnarBatch] = { - val substraitPlanPartition = GlutenTimeMetric.withMillisTime { - val planByteArray = wsCtx.root.toProtobuf.toByteArray - splitInfos.zipWithIndex.map { - case (splitInfo, index) => - val splitInfoByteArray = splitInfo match { - case filesNode: LocalFilesNode => - setFileSchemaForLocalFiles(filesNode, scan) - filesNode.setFileReadProperties(mapAsJavaMap(scan.getProperties)) - filesNode.toProtobuf.toByteArray - case extensionTableNode: ExtensionTableNode => - extensionTableNode.toProtobuf.toByteArray - } - - GlutenPartition( - index, - planByteArray, - Array(splitInfoByteArray), - locations = splitInfo.preferredLocations().asScala.toArray) - } - }(t => logInfo(s"Generating the Substrait plan took: $t ms.")) + override def next(): ColumnarBatch = { + val cb = nativeIterator.next() + outputVectorCount += 1 + outputRowCount += cb.numRows() + cb + } - new NativeFileScanColumnarRDD( - sparkContext, - substraitPlanPartition, - numOutputRows, - numOutputBatches, - scanTime) + def close(): Unit = { + collectStageMetrics() + nativeIterator.close() } -} -object CHIteratorApi { + def cancel(): Unit = { + collectStageMetrics() + nativeIterator.cancel() + } - /** Generate closeable ColumnBatch iterator. 
*/ - def genCloseableColumnBatchIterator(iter: Iterator[ColumnarBatch]): Iterator[ColumnarBatch] = { - iter match { - case _: CloseableCHColumnBatchIterator => iter - case _ => new CloseableCHColumnBatchIterator(iter) + private def collectStageMetrics(): Unit = { + if (!metricsUpdated) { + val nativeMetrics = nativeIterator.getMetrics.asInstanceOf[NativeMetrics] + nativeMetrics.setFinalOutputMetrics(outputRowCount, outputVectorCount) + updateNativeMetrics(nativeMetrics) + updateInputMetrics.foreach(_(inputMetrics)) + metricsUpdated = true } } } diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHListenerApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHListenerApi.scala index 952812d68c2a7..43e0627dffefe 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHListenerApi.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHListenerApi.scala @@ -23,9 +23,12 @@ import org.apache.gluten.execution.datasource.{GlutenOrcWriterInjects, GlutenPar import org.apache.gluten.expression.UDFMappings import org.apache.gluten.vectorized.{CHNativeExpressionEvaluator, JniLibLoader} -import org.apache.spark.SparkConf +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.api.plugin.PluginContext import org.apache.spark.internal.Logging +import org.apache.spark.listener.CHGlutenSQLAppStatusListener import org.apache.spark.network.util.JavaUtils +import org.apache.spark.rpc.{GlutenDriverEndpoint, GlutenExecutorEndpoint} import org.apache.spark.sql.execution.datasources.v1._ import org.apache.spark.util.SparkDirectoryUtil @@ -35,11 +38,22 @@ import java.util.TimeZone class CHListenerApi extends ListenerApi with Logging { - override def onDriverStart(conf: SparkConf): Unit = initialize(conf, isDriver = true) + override def onDriverStart(sc: SparkContext, pc: PluginContext): Unit = { + GlutenDriverEndpoint.glutenDriverEndpointRef = (new GlutenDriverEndpoint).self + CHGlutenSQLAppStatusListener.registerListener(sc) + initialize(pc.conf, isDriver = true) + } override def onDriverShutdown(): Unit = shutdown() - override def onExecutorStart(conf: SparkConf): Unit = initialize(conf, isDriver = false) + override def onExecutorStart(pc: PluginContext): Unit = { + GlutenExecutorEndpoint.executorEndpoint = new GlutenExecutorEndpoint(pc.executorID, pc.conf) + if (pc.conf().get("spark.master").startsWith("local")) { + logDebug("Skipping duplicate initializing clickhouse backend on spark local mode") + } else { + initialize(pc.conf, isDriver = false) + } + } override def onExecutorShutdown(): Unit = shutdown() diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHMetricsApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHMetricsApi.scala index 30f682f0fb2ee..a5fb4a1853e83 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHMetricsApi.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHMetricsApi.scala @@ -144,7 +144,7 @@ class CHMetricsApi extends MetricsApi with Logging with LogLevelUtil { "extraTime" -> SQLMetrics.createTimingMetric(sparkContext, "extra operators time"), "inputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for data"), "outputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for output"), - "totalTime" -> 
SQLMetrics.createTimingMetric(sparkContext, "total time") + "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "time") ) override def genFilterTransformerMetricsUpdater(metrics: Map[String, SQLMetric]): MetricsUpdater = @@ -160,7 +160,7 @@ class CHMetricsApi extends MetricsApi with Logging with LogLevelUtil { "extraTime" -> SQLMetrics.createTimingMetric(sparkContext, "extra operators time"), "inputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for data"), "outputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for output"), - "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "total time") + "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "time") ) override def genProjectTransformerMetricsUpdater( @@ -181,7 +181,7 @@ class CHMetricsApi extends MetricsApi with Logging with LogLevelUtil { "resizeOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of resize output rows"), "aggregatingTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of aggregating"), - "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "total time") + "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "time") ) override def genHashAggregateTransformerMetricsUpdater( @@ -198,7 +198,7 @@ class CHMetricsApi extends MetricsApi with Logging with LogLevelUtil { "extraTime" -> SQLMetrics.createTimingMetric(sparkContext, "extra operators time"), "inputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for data"), "outputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for output"), - "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "total time") + "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "time") ) override def genExpandTransformerMetricsUpdater(metrics: Map[String, SQLMetric]): MetricsUpdater = @@ -213,17 +213,15 @@ class CHMetricsApi extends MetricsApi with Logging with LogLevelUtil { Map( "dataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size"), "bytesSpilled" -> SQLMetrics.createSizeMetric(sparkContext, "shuffle bytes spilled"), - "computePidTime" -> SQLMetrics.createNanoTimingMetric( - sparkContext, - "totaltime to compute pid"), - "splitTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime to split"), - "IOTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime to disk io"), + "computePidTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to compute pid"), + "splitTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to split"), + "IOTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to disk io"), "serializeTime" -> SQLMetrics.createNanoTimingMetric( sparkContext, - "totaltime to block serialization"), - "spillTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime to spill"), - "compressTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime to compress"), - "prepareTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime to prepare"), + "time to block serialization"), + "spillTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to spill"), + "compressTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to compress"), + "prepareTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to prepare"), "avgReadBatchNumRows" -> SQLMetrics .createAverageMetric(sparkContext, "avg read batch num rows"), "numInputRows" -> SQLMetrics.createMetric(sparkContext, "number of input rows"), @@ -245,7 +243,7 @@ class CHMetricsApi 
extends MetricsApi with Logging with LogLevelUtil { "extraTime" -> SQLMetrics.createTimingMetric(sparkContext, "extra operators time"), "inputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for data"), "outputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for output"), - "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "total time") + "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "time") ) override def genWindowTransformerMetricsUpdater(metrics: Map[String, SQLMetric]): MetricsUpdater = @@ -262,7 +260,7 @@ class CHMetricsApi extends MetricsApi with Logging with LogLevelUtil { Map( "numInputRows" -> SQLMetrics.createMetric(sparkContext, "number of input rows"), "numOutputBatches" -> SQLMetrics.createMetric(sparkContext, "number of output batches"), - "convertTime" -> SQLMetrics.createTimingMetric(sparkContext, "total time to convert") + "convertTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to convert") ) override def genLimitTransformerMetrics(sparkContext: SparkContext): Map[String, SQLMetric] = @@ -275,7 +273,7 @@ class CHMetricsApi extends MetricsApi with Logging with LogLevelUtil { "extraTime" -> SQLMetrics.createTimingMetric(sparkContext, "extra operators time"), "inputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for data"), "outputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for output"), - "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "total time") + "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "time") ) override def genLimitTransformerMetricsUpdater(metrics: Map[String, SQLMetric]): MetricsUpdater = @@ -291,7 +289,7 @@ class CHMetricsApi extends MetricsApi with Logging with LogLevelUtil { "extraTime" -> SQLMetrics.createTimingMetric(sparkContext, "extra operators time"), "inputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for data"), "outputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for output"), - "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "total time") + "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "time") ) override def genSortTransformerMetricsUpdater(metrics: Map[String, SQLMetric]): MetricsUpdater = @@ -339,7 +337,7 @@ class CHMetricsApi extends MetricsApi with Logging with LogLevelUtil { SQLMetrics.createTimingMetric(sparkContext, "time of postProjection"), "probeTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of probe"), - "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "total time"), + "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "time"), "fillingRightJoinSideTime" -> SQLMetrics.createTimingMetric( sparkContext, "filling right join side time"), @@ -361,6 +359,17 @@ class CHMetricsApi extends MetricsApi with Logging with LogLevelUtil { s"NestedLoopJoinTransformer metrics update is not supported in CH backend") } + override def genSampleTransformerMetrics(sparkContext: SparkContext): Map[String, SQLMetric] = { + throw new UnsupportedOperationException( + s"SampleTransformer metrics update is not supported in CH backend") + } + + override def genSampleTransformerMetricsUpdater( + metrics: Map[String, SQLMetric]): MetricsUpdater = { + throw new UnsupportedOperationException( + s"SampleTransformer metrics update is not supported in CH backend") + } + def genWriteFilesTransformerMetrics(sparkContext: SparkContext): Map[String, SQLMetric] = { throw new UnsupportedOperationException( 
s"WriteFilesTransformer metrics update is not supported in CH backend") diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala index 8c2b20db6f845..44aeba0215572 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala @@ -22,12 +22,12 @@ import org.apache.gluten.exception.GlutenNotSupportException import org.apache.gluten.execution._ import org.apache.gluten.expression._ import org.apache.gluten.extension.{CountDistinctWithoutExpand, FallbackBroadcastHashJoin, FallbackBroadcastHashJoinPrepQueryStage, RewriteToDateExpresstionRule} -import org.apache.gluten.extension.columnar.AddTransformHintRule +import org.apache.gluten.extension.columnar.AddFallbackTagRule import org.apache.gluten.extension.columnar.MiscColumnarRules.TransformPreOverrides import org.apache.gluten.extension.columnar.transition.Convention import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.substrait.expression.{ExpressionBuilder, ExpressionNode, WindowFunctionNode} -import org.apache.gluten.utils.CHJoinValidateUtil +import org.apache.gluten.utils.{CHJoinValidateUtil, UnknownJoinStrategy} import org.apache.gluten.vectorized.CHColumnarBatchSerializer import org.apache.spark.{ShuffleDependency, SparkException} @@ -50,7 +50,6 @@ import org.apache.spark.sql.delta.files.TahoeFileIndex import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.AQEShuffleReadExec import org.apache.spark.sql.execution.datasources.{FileFormat, HadoopFsRelation} -import org.apache.spark.sql.execution.datasources.GlutenWriterColumnarRules.NativeWritePostRule import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.execution.datasources.v2.clickhouse.source.DeltaMergeTreeFileFormat import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ShuffleExchangeExec} @@ -66,6 +65,7 @@ import org.apache.commons.lang3.ClassUtils import java.lang.{Long => JLong} import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer class CHSparkPlanExecApi extends SparkPlanExecApi { @@ -146,7 +146,7 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { child match { case scan: FileSourceScanExec if (checkMergeTreeFileFormat(scan.relation)) => - // For the validation phase of the AddTransformHintRule + // For the validation phase of the AddFallbackTagRule CHFilterExecTransformer(condition, child) case scan: FileSourceScanExecTransformerBase if (checkMergeTreeFileFormat(scan.relation)) => // For the transform phase, the FileSourceScanExec is already transformed @@ -226,7 +226,7 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { // FIXME: The operation happens inside ReplaceSingleNode(). // Caller may not know it adds project on top of the shuffle. val project = TransformPreOverrides().apply( - AddTransformHintRule().apply( + AddFallbackTagRule().apply( ProjectExec(plan.child.output ++ projectExpressions, plan.child))) var newExprs = Seq[Expression]() for (i <- exprs.indices) { @@ -251,7 +251,7 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { // FIXME: The operation happens inside ReplaceSingleNode(). 
// Caller may not know it adds project on top of the shuffle. val project = TransformPreOverrides().apply( - AddTransformHintRule().apply( + AddFallbackTagRule().apply( ProjectExec(plan.child.output ++ projectExpressions, plan.child))) var newOrderings = Seq[SortOrder]() for (i <- orderings.indices) { @@ -271,13 +271,10 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { } } - override def genColumnarShuffleExchange( - shuffle: ShuffleExchangeExec, - child: SparkPlan): SparkPlan = { + override def genColumnarShuffleExchange(shuffle: ShuffleExchangeExec): SparkPlan = { + val child = shuffle.child if ( - BackendsApiManager.getSettings.supportShuffleWithProject( - shuffle.outputPartitioning, - shuffle.child) + BackendsApiManager.getSettings.supportShuffleWithProject(shuffle.outputPartitioning, child) ) { val (projectColumnNumber, newPartitioning, newChild) = addProjectionForShuffleExchange(shuffle) @@ -379,13 +376,21 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { throw new GlutenNotSupportException( "BroadcastNestedLoopJoinExecTransformer is not supported in ch backend.") + override def genSampleExecTransformer( + lowerBound: Double, + upperBound: Double, + withReplacement: Boolean, + seed: Long, + child: SparkPlan): SampleExecTransformer = + throw new GlutenNotSupportException("SampleExecTransformer is not supported in ch backend.") + /** Generate an expression transformer to transform GetMapValue to Substrait. */ def genGetMapValueTransformer( substraitExprName: String, left: ExpressionTransformer, right: ExpressionTransformer, original: GetMapValue): ExpressionTransformer = - GetMapValueTransformer(substraitExprName, left, right, original.failOnError, original) + GetMapValueTransformer(substraitExprName, left, right, false, original) /** * Generate ShuffleDependency for ColumnarShuffleExchangeExec. @@ -577,14 +582,6 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { override def genExtendedColumnarTransformRules(): List[SparkSession => Rule[SparkPlan]] = List() - /** - * Generate extended columnar post-rules. - * - * @return - */ - override def genExtendedColumnarPostRules(): List[SparkSession => Rule[SparkPlan]] = - List(spark => NativeWritePostRule(spark)) - override def genInjectPostHocResolutionRules(): List[SparkSession => Rule[LogicalPlan]] = { List() } @@ -615,13 +612,6 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { CHStringTranslateTransformer(substraitExprName, srcExpr, matchingExpr, replaceExpr, original) } - override def genSizeExpressionTransformer( - substraitExprName: String, - child: ExpressionTransformer, - original: Size): ExpressionTransformer = { - CHSizeExpressionTransformer(substraitExprName, child, original) - } - override def genLikeTransformer( substraitExprName: String, left: ExpressionTransformer, @@ -686,15 +676,19 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { } /** - * Define whether the join operator is fallback because of the join operator is not supported by - * backend + * This is only used to control whether transform smj into shj or not at present. We always prefer + * shj. 
*/ override def joinFallback( - JoinType: JoinType, + joinType: JoinType, leftOutputSet: AttributeSet, rightOutputSet: AttributeSet, condition: Option[Expression]): Boolean = { - CHJoinValidateUtil.shouldFallback(JoinType, leftOutputSet, rightOutputSet, condition) + CHJoinValidateUtil.shouldFallback( + UnknownJoinStrategy(joinType), + leftOutputSet, + rightOutputSet, + condition) } /** Generate window function node */ @@ -718,9 +712,10 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { new JArrayList[ExpressionNode](), columnName, ConverterUtils.getTypeNode(aggWindowFunc.dataType, aggWindowFunc.nullable), - WindowExecTransformer.getFrameBound(frame.upper), - WindowExecTransformer.getFrameBound(frame.lower), - frame.frameType.sql + frame.upper, + frame.lower, + frame.frameType.sql, + originalInputAttributes.asJava ) windowExpressionNodes.add(windowFunctionNode) case aggExpression: AggregateExpression => @@ -744,9 +739,10 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { childrenNodeList, columnName, ConverterUtils.getTypeNode(aggExpression.dataType, aggExpression.nullable), - WindowExecTransformer.getFrameBound(frame.upper), - WindowExecTransformer.getFrameBound(frame.lower), - frame.frameType.sql + frame.upper, + frame.lower, + frame.frameType.sql, + originalInputAttributes.asJava ) windowExpressionNodes.add(windowFunctionNode) case wf @ (Lead(_, _, _, _) | Lag(_, _, _, _)) => @@ -793,9 +789,10 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { childrenNodeList, columnName, ConverterUtils.getTypeNode(offsetWf.dataType, offsetWf.nullable), - WindowExecTransformer.getFrameBound(frame.upper), - WindowExecTransformer.getFrameBound(frame.lower), - frame.frameType.sql + frame.upper, + frame.lower, + frame.frameType.sql, + originalInputAttributes.asJava ) windowExpressionNodes.add(windowFunctionNode) case _ => @@ -845,6 +842,24 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { CHGenerateExecTransformer(generator, requiredChildOutput, outer, generatorOutput, child) } + /** Transform array filter to Substrait. */ + override def genArrayFilterTransformer( + substraitExprName: String, + argument: ExpressionTransformer, + function: ExpressionTransformer, + expr: ArrayFilter): ExpressionTransformer = { + GenericExpressionTransformer(substraitExprName, Seq(argument, function), expr) + } + + /** Transform array transform to Substrait. 
*/ + override def genArrayTransformTransformer( + substraitExprName: String, + argument: ExpressionTransformer, + function: ExpressionTransformer, + expr: ArrayTransform): ExpressionTransformer = { + GenericExpressionTransformer(substraitExprName, Seq(argument, function), expr) + } + override def genPreProjectForGenerate(generate: GenerateExec): SparkPlan = generate override def genPostProjectForGenerate(generate: GenerateExec): SparkPlan = generate diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHTransformerApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHTransformerApi.scala index c75cf4788ba97..9653256256bde 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHTransformerApi.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHTransformerApi.scala @@ -23,6 +23,7 @@ import org.apache.gluten.substrait.expression.{BooleanLiteralNode, ExpressionBui import org.apache.gluten.utils.{CHInputPartitionsUtil, ExpressionDocUtil} import org.apache.spark.internal.Logging +import org.apache.spark.rpc.GlutenDriverEndpoint import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.connector.read.InputPartition import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 @@ -44,6 +45,7 @@ class CHTransformerApi extends TransformerApi with Logging { /** Generate Seq[InputPartition] for FileSourceScanExecTransformer. */ def genInputPartitionSeq( relation: HadoopFsRelation, + requiredSchema: StructType, selectedPartitions: Array[PartitionDirectory], output: Seq[Attribute], bucketedScan: Boolean, @@ -68,12 +70,11 @@ class CHTransformerApi extends TransformerApi with Logging { disableBucketedScan, filterExprs ) - case _: TahoeFileIndex => - throw new UnsupportedOperationException("Does not support delta-parquet") case _ => // Generate FilePartition for Parquet CHInputPartitionsUtil( relation, + requiredSchema, selectedPartitions, output, bucketedScan, @@ -91,12 +92,6 @@ class CHTransformerApi extends TransformerApi with Logging { val offHeapSize = nativeConfMap.getOrDefault("spark.gluten.memory.offHeap.size.in.bytes", "0").toLong if (offHeapSize > 0) { - // Only set default max_bytes_before_external_sort for CH when it is not set explicitly. - val sortSpillKey = settingPrefix + "max_bytes_before_external_sort"; - if (!nativeConfMap.containsKey(sortSpillKey)) { - val sortSpillValue = offHeapSize * 0.5 - nativeConfMap.put(sortSpillKey, sortSpillValue.toLong.toString) - } // Only set default max_bytes_before_external_group_by for CH when it is not set explicitly. 
val groupBySpillKey = settingPrefix + "max_bytes_before_external_group_by"; @@ -233,4 +228,8 @@ class CHTransformerApi extends TransformerApi with Logging { } override def packPBMessage(message: Message): Any = Any.pack(message) + + override def invalidateSQLExecutionResource(executionId: String): Unit = { + GlutenDriverEndpoint.invalidateResourceRelation(executionId) + } } diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHGenerateExecTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHGenerateExecTransformer.scala index f1e7d305a3589..733c0a472814d 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHGenerateExecTransformer.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHGenerateExecTransformer.scala @@ -54,7 +54,7 @@ case class CHGenerateExecTransformer( "extraTime" -> SQLMetrics.createTimingMetric(sparkContext, "extra operators time"), "inputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for data"), "outputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for output"), - "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "total time") + "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "time") ) override def metricsUpdater(): MetricsUpdater = new GenerateMetricsUpdater(metrics) diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashAggregateExecTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashAggregateExecTransformer.scala index 4a4d345db1dd7..7e688814381bc 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashAggregateExecTransformer.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashAggregateExecTransformer.scala @@ -81,8 +81,8 @@ case class CHHashAggregateExecTransformer( } } - override def doTransform(context: SubstraitContext): TransformContext = { - val childCtx = child.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val childCtx = child.asInstanceOf[TransformSupport].transform(context) val operatorId = context.nextOperatorId(this.nodeName) val aggParams = new AggregationParams diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashJoinExecTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashJoinExecTransformer.scala index 046f26043c5d3..da9d9c7586c05 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashJoinExecTransformer.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashJoinExecTransformer.scala @@ -16,15 +16,16 @@ */ package org.apache.gluten.execution -import org.apache.gluten.backendsapi.clickhouse.CHIteratorApi import org.apache.gluten.extension.ValidationResult -import org.apache.gluten.utils.CHJoinValidateUtil +import org.apache.gluten.utils.{BroadcastHashJoinStrategy, CHJoinValidateUtil, ShuffleHashJoinStrategy} import org.apache.spark.{broadcast, SparkContext} +import org.apache.spark.rdd.RDD +import org.apache.spark.rpc.GlutenDriverEndpoint import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.optimizer.BuildSide import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.{SparkPlan, SQLExecution} import 
org.apache.spark.sql.execution.joins.BuildSideRelation import org.apache.spark.sql.vectorized.ColumnarBatch @@ -53,7 +54,11 @@ case class CHShuffledHashJoinExecTransformer( override protected def doValidateInternal(): ValidationResult = { val shouldFallback = - CHJoinValidateUtil.shouldFallback(joinType, left.outputSet, right.outputSet, condition) + CHJoinValidateUtil.shouldFallback( + ShuffleHashJoinStrategy(joinType), + left.outputSet, + right.outputSet, + condition) if (shouldFallback) { return ValidationResult.notOk("ch join validate fail") } @@ -69,13 +74,14 @@ case class CHBroadcastBuildSideRDD( override def genBroadcastBuildSideIterator(): Iterator[ColumnarBatch] = { CHBroadcastBuildSideCache.getOrBuildBroadcastHashTable(broadcasted, broadcastContext) - CHIteratorApi.genCloseableColumnBatchIterator(Iterator.empty) + Iterator.empty } } case class BroadCastHashJoinContext( buildSideJoinKeys: Seq[Expression], joinType: JoinType, + hasMixedFiltCondition: Boolean, buildSideStructure: Seq[Attribute], buildHashTableId: String) @@ -105,7 +111,11 @@ case class CHBroadcastHashJoinExecTransformer( override protected def doValidateInternal(): ValidationResult = { val shouldFallback = - CHJoinValidateUtil.shouldFallback(joinType, left.outputSet, right.outputSet, condition) + CHJoinValidateUtil.shouldFallback( + BroadcastHashJoinStrategy(joinType), + left.outputSet, + right.outputSet, + condition) if (shouldFallback) { return ValidationResult.notOk("ch join validate fail") @@ -116,10 +126,39 @@ case class CHBroadcastHashJoinExecTransformer( super.doValidateInternal() } - override protected def createBroadcastBuildSideRDD(): BroadcastBuildSideRDD = { + override def columnarInputRDDs: Seq[RDD[ColumnarBatch]] = { + val streamedRDD = getColumnarInputRDDs(streamedPlan) + val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) + if (executionId != null) { + GlutenDriverEndpoint.collectResources(executionId, buildHashTableId) + } else { + logWarning( + s"Can't not trace broadcast hash table data $buildHashTableId" + + s" because execution id is null." + + s" Will clean up until expire time.") + } val broadcast = buildPlan.executeBroadcast[BuildSideRelation]() val context = - BroadCastHashJoinContext(buildKeyExprs, joinType, buildPlan.output, buildHashTableId) - CHBroadcastBuildSideRDD(sparkContext, broadcast, context) + BroadCastHashJoinContext( + buildKeyExprs, + joinType, + isMixedCondition(condition), + buildPlan.output, + buildHashTableId) + val broadcastRDD = CHBroadcastBuildSideRDD(sparkContext, broadcast, context) + // FIXME: Do we have to make build side a RDD? 
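A minimal sketch of how the executionId bookkeeping above ties into the endpoint changes elsewhere in this diff: the driver records which broadcast hash tables an execution owns, and the executor endpoint later evicts them from CHBroadcastBuildSideCache once it receives GlutenCleanExecutionResource. The wrapper method below is hypothetical and only restates calls that already appear in this diff.

  // Hypothetical helper illustrating the broadcast-resource tracking flow.
  def trackBroadcastHashTable(sc: SparkContext, buildHashTableId: String): Unit = {
    val executionId = sc.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
    if (executionId != null) {
      // Driver side: remember that this execution owns the given hash table id.
      GlutenDriverEndpoint.collectResources(executionId, buildHashTableId)
    }
    // On cleanup, CHTransformerApi.invalidateSQLExecutionResource(executionId) calls
    // GlutenDriverEndpoint.invalidateResourceRelation(executionId); the executor endpoint
    // then handles GlutenCleanExecutionResource and drops each id via
    // CHBroadcastBuildSideCache.invalidateBroadcastHashtable(id).
  }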
+ streamedRDD :+ broadcastRDD + } + + def isMixedCondition(cond: Option[Expression]): Boolean = { + val res = if (cond.isDefined) { + val leftOutputSet = left.outputSet + val rightOutputSet = right.outputSet + val allReferences = cond.get.references + !(allReferences.subsetOf(leftOutputSet) || allReferences.subsetOf(rightOutputSet)) + } else { + false + } + res } } diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHSortMergeJoinExecTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHSortMergeJoinExecTransformer.scala index a5ac5f65840d4..e2b5865517391 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHSortMergeJoinExecTransformer.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHSortMergeJoinExecTransformer.scala @@ -17,7 +17,7 @@ package org.apache.gluten.execution import org.apache.gluten.extension.ValidationResult -import org.apache.gluten.utils.CHJoinValidateUtil +import org.apache.gluten.utils.{CHJoinValidateUtil, SortMergeJoinStrategy} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ @@ -44,7 +44,11 @@ case class CHSortMergeJoinExecTransformer( override protected def doValidateInternal(): ValidationResult = { val shouldFallback = - CHJoinValidateUtil.shouldFallback(joinType, left.outputSet, right.outputSet, condition, true) + CHJoinValidateUtil.shouldFallback( + SortMergeJoinStrategy(joinType), + left.outputSet, + right.outputSet, + condition) if (shouldFallback) { return ValidationResult.notOk("ch join validate fail") } diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/NativeFileScanColumnarRDD.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/NativeFileScanColumnarRDD.scala index 624a4390d729f..af512934bc96b 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/NativeFileScanColumnarRDD.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/NativeFileScanColumnarRDD.scala @@ -17,7 +17,7 @@ package org.apache.gluten.execution import org.apache.gluten.metrics.GlutenTimeMetric -import org.apache.gluten.vectorized.{CHNativeExpressionEvaluator, CloseableCHColumnBatchIterator, GeneralInIterator, GeneralOutIterator} +import org.apache.gluten.vectorized.{CHNativeExpressionEvaluator, CloseableCHColumnBatchIterator, GeneralInIterator} import org.apache.spark.{Partition, SparkContext, SparkException, TaskContext} import org.apache.spark.rdd.RDD @@ -47,7 +47,7 @@ class NativeFileScanColumnarRDD( .asInstanceOf[GlutenPartition] .splitInfosByteArray - val resIter: GeneralOutIterator = GlutenTimeMetric.millis(scanTime) { + val resIter = GlutenTimeMetric.millis(scanTime) { _ => val transKernel = new CHNativeExpressionEvaluator() val inBatchIters = new util.ArrayList[GeneralInIterator]() @@ -58,6 +58,14 @@ class NativeFileScanColumnarRDD( false ) } + TaskContext + .get() + .addTaskFailureListener( + (ctx, _) => { + if (ctx.isInterrupted()) { + resIter.cancel() + } + }) TaskContext.get().addTaskCompletionListener[Unit](_ => resIter.close()) val iter: Iterator[ColumnarBatch] = new Iterator[ColumnarBatch] { var scanTotalTime = 0L diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/expression/CHExpressionTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/expression/CHExpressionTransformer.scala index 5ca4e02339d0a..7b389ead0091b 100644 --- 
a/backends-clickhouse/src/main/scala/org/apache/gluten/expression/CHExpressionTransformer.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/expression/CHExpressionTransformer.scala @@ -30,16 +30,6 @@ import com.google.common.collect.Lists import java.util.Locale -case class CHSizeExpressionTransformer( - substraitExprName: String, - expr: ExpressionTransformer, - original: Size) - extends BinaryExpressionTransformer { - override def left: ExpressionTransformer = expr - // Pass legacyLiteral as second argument in substrait function - override def right: ExpressionTransformer = LiteralTransformer(original.legacySizeOfNull) -} - case class CHTruncTimestampTransformer( substraitExprName: String, format: ExpressionTransformer, diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/extension/FallbackBroadcaseHashJoinRules.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/extension/FallbackBroadcaseHashJoinRules.scala index 873ecb8342a60..59c2d6494bdba 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/extension/FallbackBroadcaseHashJoinRules.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/extension/FallbackBroadcaseHashJoinRules.scala @@ -19,7 +19,7 @@ package org.apache.gluten.extension import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.extension.columnar._ -import org.apache.gluten.extension.columnar.TransformHints.EncodeTransformableTagImplicits +import org.apache.gluten.extension.columnar.FallbackTags.EncodeFallbackTagImplicits import org.apache.gluten.utils.PhysicalPlanSelector import org.apache.spark.sql.SparkSession @@ -61,7 +61,7 @@ case class FallbackBroadcastHashJoinPrepQueryStage(session: SparkSession) extend "columnar broadcast exchange is disabled or " + "columnar broadcast join is disabled") } else { - if (TransformHints.isNotTransformable(bhj)) { + if (FallbackTags.nonEmpty(bhj)) { ValidationResult.notOk("broadcast join is already tagged as not transformable") } else { val bhjTransformer = BackendsApiManager.getSparkPlanExecApiInstance @@ -83,8 +83,8 @@ case class FallbackBroadcastHashJoinPrepQueryStage(session: SparkSession) extend } } } - TransformHints.tagNotTransformable(bhj, isTransformable) - TransformHints.tagNotTransformable(exchange, isTransformable) + FallbackTags.add(bhj, isTransformable) + FallbackTags.add(exchange, isTransformable) case _ => // Skip. This might be the case that the exchange was already // executed in earlier stage @@ -116,7 +116,7 @@ case class FallbackBroadcastHashJoin(session: SparkSession) extends Rule[SparkPl // Currently their doBroadcast() methods just propagate child's broadcast // payloads which is not right in speaking of columnar. 
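As a summary of the rename driving this file's changes: TransformHints.isNotTransformable(plan) becomes FallbackTags.nonEmpty(plan), and TransformHints.tagNotTransformable(plan, reason) becomes FallbackTags.add(plan, reason), where the reason may be a plain string or a ValidationResult. A minimal sketch using only those two calls (the helper name itself is hypothetical):

  private def markFallback(plan: SparkPlan, reason: String): Unit = {
    if (FallbackTags.nonEmpty(plan)) {
      return // already tagged to fall back; nothing more to record
    }
    FallbackTags.add(plan, reason)
  }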
if (!enableColumnarBroadcastJoin) { - TransformHints.tagNotTransformable( + FallbackTags.add( bhj, "columnar BroadcastJoin is not enabled in BroadcastHashJoinExec") } else { @@ -149,7 +149,7 @@ case class FallbackBroadcastHashJoin(session: SparkSession) extends Rule[SparkPl case Some(exchange @ BroadcastExchangeExec(mode, child)) => isBhjTransformable.tagOnFallback(bhj) if (!isBhjTransformable.isValid) { - TransformHints.tagNotTransformable(exchange, isBhjTransformable) + FallbackTags.add(exchange, isBhjTransformable) } case None => // we are in AQE, find the hidden exchange @@ -182,7 +182,7 @@ case class FallbackBroadcastHashJoin(session: SparkSession) extends Rule[SparkPl // to conform to the underlying exchange's type, columnar or vanilla exchange match { case BroadcastExchangeExec(mode, child) => - TransformHints.tagNotTransformable( + FallbackTags.add( bhj, "it's a materialized broadcast exchange or reused broadcast exchange") case ColumnarBroadcastExchangeExec(mode, child) => @@ -199,7 +199,7 @@ case class FallbackBroadcastHashJoin(session: SparkSession) extends Rule[SparkPl } } catch { case e: UnsupportedOperationException => - TransformHints.tagNotTransformable( + FallbackTags.add( p, s"${e.getMessage}, original Spark plan is " + s"${p.getClass}(${p.children.toList.map(_.getClass)})") diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/HashAggregateMetricsUpdater.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/HashAggregateMetricsUpdater.scala index e2014e5b8b844..b035d7a04fb0a 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/HashAggregateMetricsUpdater.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/HashAggregateMetricsUpdater.scala @@ -65,7 +65,7 @@ class HashAggregateMetricsUpdater(val metrics: Map[String, SQLMetric]) } } } catch { - case e: Throwable => + case e: Exception => logError(s"Updating native metrics failed due to ${e.getCause}.") throw e } diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/HashJoinMetricsUpdater.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/HashJoinMetricsUpdater.scala index 3c35286c1c13f..ca891bac27c63 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/HashJoinMetricsUpdater.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/HashJoinMetricsUpdater.scala @@ -104,7 +104,7 @@ class HashJoinMetricsUpdater(val metrics: Map[String, SQLMetric]) } } } catch { - case e: Throwable => + case e: Exception => logError(s"Updating native metrics failed due to ${e.getCause}.") throw e } diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/MetricsUtil.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/MetricsUtil.scala index a6dfb3dbcb1fd..e1e0f7c11a09a 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/MetricsUtil.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/MetricsUtil.scala @@ -41,7 +41,7 @@ object MetricsUtil extends Logging { case t: TransformSupport => MetricsUpdaterTree(t.metricsUpdater(), t.children.map(treeifyMetricsUpdaters)) case _ => - MetricsUpdaterTree(NoopMetricsUpdater, Seq()) + MetricsUpdaterTree(MetricsUpdater.Terminate, Seq()) } } @@ -107,7 +107,7 @@ object MetricsUtil extends Logging { s"Updating native metrics failed due to the wrong size of metrics data: " + s"$numNativeMetrics") () - } else if (mutNode.updater == NoopMetricsUpdater) { + } else if (mutNode.updater == 
MetricsUpdater.Terminate) { () } else { updateTransformerMetricsInternal( @@ -159,7 +159,7 @@ object MetricsUtil extends Logging { mutNode.children.foreach { child => - if (child.updater != NoopMetricsUpdater) { + if (child.updater != MetricsUpdater.Terminate) { val result = updateTransformerMetricsInternal( child, relMap, @@ -177,10 +177,12 @@ object MetricsUtil extends Logging { /** Get all processors */ def getAllProcessorList(metricData: MetricsData): Seq[MetricsProcessor] = { - metricData.steps.asScala.flatMap( - step => { - step.processors.asScala - }) + metricData.steps.asScala + .flatMap( + step => { + step.processors.asScala + }) + .toSeq } /** Update extra time metric by the processors */ diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala index 5f78d25cc5c2b..14f0ff4891884 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala @@ -44,6 +44,15 @@ case class SequenceValidator() extends FunctionValidator { } } +case class UtcTimestampValidator() extends FunctionValidator { + override def doValidate(expr: Expression): Boolean = expr match { + // CH backend doest not support non-const timezone parameter + case t: ToUTCTimestamp => t.children(1).isInstanceOf[Literal] + case f: FromUTCTimestamp => f.children(1).isInstanceOf[Literal] + case _ => false + } +} + case class UnixTimeStampValidator() extends FunctionValidator { final val DATE_TYPE = "date" @@ -184,7 +193,7 @@ object CHExpressionUtil { URL_DECODE -> DefaultValidator(), URL_ENCODE -> DefaultValidator(), SKEWNESS -> DefaultValidator(), - BIT_LENGTH -> DefaultValidator(), + SOUNDEX -> DefaultValidator(), MAKE_YM_INTERVAL -> DefaultValidator(), MAP_ZIP_WITH -> DefaultValidator(), ZIP_WITH -> DefaultValidator(), @@ -193,14 +202,12 @@ object CHExpressionUtil { REGR_SLOPE -> DefaultValidator(), REGR_INTERCEPT -> DefaultValidator(), REGR_SXY -> DefaultValidator(), - TO_UTC_TIMESTAMP -> DefaultValidator(), - FROM_UTC_TIMESTAMP -> DefaultValidator(), + TO_UTC_TIMESTAMP -> UtcTimestampValidator(), + FROM_UTC_TIMESTAMP -> UtcTimestampValidator(), UNIX_MILLIS -> DefaultValidator(), UNIX_MICROS -> DefaultValidator(), TIMESTAMP_MILLIS -> DefaultValidator(), TIMESTAMP_MICROS -> DefaultValidator(), - FLATTEN -> DefaultValidator(), - RINT -> DefaultValidator(), STACK -> DefaultValidator() ) } diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHInputPartitionsUtil.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHInputPartitionsUtil.scala index 90ab336689fe7..0f35ff66d4d1e 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHInputPartitionsUtil.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHInputPartitionsUtil.scala @@ -22,8 +22,8 @@ import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.connector.read.InputPartition -import org.apache.spark.sql.execution.PartitionedFileUtil import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.types.StructType import org.apache.spark.util.SparkResourceUtil import org.apache.spark.util.collection.BitSet @@ -33,6 +33,7 @@ import scala.collection.mutable.ArrayBuffer case class CHInputPartitionsUtil( relation: HadoopFsRelation, + 
requiredSchema: StructType, selectedPartitions: Array[PartitionDirectory], output: Seq[Attribute], bucketedScan: Boolean, @@ -66,7 +67,7 @@ case class CHInputPartitionsUtil( val splitFiles = selectedPartitions .flatMap { partition => - partition.files.flatMap { + SparkShimLoader.getSparkShims.getFileStatus(partition).flatMap { file => // getPath() is very expensive so we only want to call it once in this block: val filePath = file.getPath @@ -74,13 +75,14 @@ case class CHInputPartitionsUtil( if (shouldProcess(filePath)) { val isSplitable = relation.fileFormat.isSplitable(relation.sparkSession, relation.options, filePath) - PartitionedFileUtil.splitFiles( - sparkSession = relation.sparkSession, - file = file, - filePath = filePath, - isSplitable = isSplitable, - maxSplitBytes = maxSplitBytes, - partitionValues = partition.values) + SparkShimLoader.getSparkShims.splitFiles( + relation.sparkSession, + file, + filePath, + isSplitable, + maxSplitBytes, + partition.values + ) } else { Seq.empty } diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHJoinValidateUtil.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHJoinValidateUtil.scala index 06b2445af6e1c..dae8e6e073a18 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHJoinValidateUtil.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHJoinValidateUtil.scala @@ -17,9 +17,17 @@ package org.apache.gluten.utils import org.apache.spark.internal.Logging -import org.apache.spark.sql.catalyst.expressions.{AttributeSet, EqualTo, Expression, GreaterThan, GreaterThanOrEqual, In, LessThan, LessThanOrEqual, Not, Or} +import org.apache.spark.sql.catalyst.expressions.{AttributeSet, Expression} import org.apache.spark.sql.catalyst.plans.JoinType +trait JoinStrategy { + val joinType: JoinType +} +case class UnknownJoinStrategy(joinType: JoinType) extends JoinStrategy {} +case class ShuffleHashJoinStrategy(joinType: JoinType) extends JoinStrategy {} +case class BroadcastHashJoinStrategy(joinType: JoinType) extends JoinStrategy {} +case class SortMergeJoinStrategy(joinType: JoinType) extends JoinStrategy {} + /** * The logic here is that if it is not an equi-join spark will create BNLJ, which will fallback, if * it is an equi-join, spark will create BroadcastHashJoin or ShuffleHashJoin, for these join types, @@ -34,78 +42,40 @@ object CHJoinValidateUtil extends Logging { def hasTwoTableColumn( leftOutputSet: AttributeSet, rightOutputSet: AttributeSet, - l: Expression, - r: Expression): Boolean = { - val allReferences = l.references ++ r.references + expr: Expression): Boolean = { + val allReferences = expr.references !(allReferences.subsetOf(leftOutputSet) || allReferences.subsetOf(rightOutputSet)) } def shouldFallback( - joinType: JoinType, + joinStrategy: JoinStrategy, leftOutputSet: AttributeSet, rightOutputSet: AttributeSet, - condition: Option[Expression], - isSMJ: Boolean = false): Boolean = { + condition: Option[Expression]): Boolean = { var shouldFallback = false + val joinType = joinStrategy.joinType if (joinType.toString.contains("ExistenceJoin")) { return true } - if (joinType.sql.equals("INNER")) { - return shouldFallback - } - if (isSMJ) { - if ( - joinType.sql.contains("SEMI") - || joinType.sql.contains("ANTI") - ) { - return true + if (joinType.sql.contains("INNER")) { + shouldFallback = false; + } else if ( + condition.isDefined && hasTwoTableColumn(leftOutputSet, rightOutputSet, condition.get) + ) { + shouldFallback = joinStrategy match { + case 
BroadcastHashJoinStrategy(joinTy) => + joinTy.sql.contains("SEMI") || joinTy.sql.contains("ANTI") + case SortMergeJoinStrategy(_) => true + case ShuffleHashJoinStrategy(joinTy) => + joinTy.sql.contains("SEMI") || joinTy.sql.contains("ANTI") + case UnknownJoinStrategy(joinTy) => + joinTy.sql.contains("SEMI") || joinTy.sql.contains("ANTI") } - } - if (condition.isDefined) { - condition.get.transform { - case Or(l, r) => - if (hasTwoTableColumn(leftOutputSet, rightOutputSet, l, r)) { - shouldFallback = true - } - Or(l, r) - case Not(EqualTo(l, r)) => - if (hasTwoTableColumn(leftOutputSet, rightOutputSet, l, r)) { - shouldFallback = true - } - Not(EqualTo(l, r)) - case LessThan(l, r) => - if (hasTwoTableColumn(leftOutputSet, rightOutputSet, l, r)) { - shouldFallback = true - } - LessThan(l, r) - case LessThanOrEqual(l, r) => - if (hasTwoTableColumn(leftOutputSet, rightOutputSet, l, r)) { - shouldFallback = true - } - LessThanOrEqual(l, r) - case GreaterThan(l, r) => - if (hasTwoTableColumn(leftOutputSet, rightOutputSet, l, r)) { - shouldFallback = true - } - GreaterThan(l, r) - case GreaterThanOrEqual(l, r) => - if (hasTwoTableColumn(leftOutputSet, rightOutputSet, l, r)) { - shouldFallback = true - } - GreaterThanOrEqual(l, r) - case In(l, r) => - r.foreach( - e => { - if (hasTwoTableColumn(leftOutputSet, rightOutputSet, l, e)) { - shouldFallback = true - } - }) - In(l, r) - case EqualTo(l, r) => - if (hasTwoTableColumn(leftOutputSet, rightOutputSet, l, r)) { - shouldFallback = true - } - EqualTo(l, r) + } else { + shouldFallback = joinStrategy match { + case SortMergeJoinStrategy(joinTy) => + joinTy.sql.contains("SEMI") || joinTy.sql.contains("ANTI") + case _ => false } } shouldFallback diff --git a/gluten-core/src/main/scala/org/apache/spark/listener/GlutenSQLAppStatusListener.scala b/backends-clickhouse/src/main/scala/org/apache/spark/listener/CHGlutenSQLAppStatusListener.scala similarity index 86% rename from gluten-core/src/main/scala/org/apache/spark/listener/GlutenSQLAppStatusListener.scala rename to backends-clickhouse/src/main/scala/org/apache/spark/listener/CHGlutenSQLAppStatusListener.scala index 8c98d30442a53..7984fa8465066 100644 --- a/gluten-core/src/main/scala/org/apache/spark/listener/GlutenSQLAppStatusListener.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/listener/CHGlutenSQLAppStatusListener.scala @@ -16,14 +16,15 @@ */ package org.apache.spark.listener +import org.apache.spark.SparkContext import org.apache.spark.internal.Logging +import org.apache.spark.rpc.{GlutenDriverEndpoint, RpcEndpointRef} import org.apache.spark.rpc.GlutenRpcMessages._ -import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.scheduler._ import org.apache.spark.sql.execution.ui._ /** Gluten SQL listener. Used for monitor sql on whole life cycle.Create and release resource. 
*/ -class GlutenSQLAppStatusListener(val driverEndpointRef: RpcEndpointRef) +class CHGlutenSQLAppStatusListener(val driverEndpointRef: RpcEndpointRef) extends SparkListener with Logging { @@ -68,3 +69,9 @@ class GlutenSQLAppStatusListener(val driverEndpointRef: RpcEndpointRef) logTrace(s"Execution $executionId end.") } } +object CHGlutenSQLAppStatusListener { + def registerListener(sc: SparkContext): Unit = { + sc.listenerBus.addToStatusQueue( + new CHGlutenSQLAppStatusListener(GlutenDriverEndpoint.glutenDriverEndpointRef)) + } +} diff --git a/gluten-core/src/main/scala/org/apache/spark/rpc/GlutenDriverEndpoint.scala b/backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenDriverEndpoint.scala similarity index 100% rename from gluten-core/src/main/scala/org/apache/spark/rpc/GlutenDriverEndpoint.scala rename to backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenDriverEndpoint.scala diff --git a/gluten-core/src/main/scala/org/apache/spark/rpc/GlutenExecutorEndpoint.scala b/backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenExecutorEndpoint.scala similarity index 89% rename from gluten-core/src/main/scala/org/apache/spark/rpc/GlutenExecutorEndpoint.scala rename to backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenExecutorEndpoint.scala index e481942327710..f05933ef78e33 100644 --- a/gluten-core/src/main/scala/org/apache/spark/rpc/GlutenExecutorEndpoint.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenExecutorEndpoint.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.rpc -import org.apache.gluten.backendsapi.BackendsApiManager +import org.apache.gluten.execution.CHBroadcastBuildSideCache import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.internal.{config, Logging} @@ -60,10 +60,15 @@ class GlutenExecutorEndpoint(val executorId: String, val conf: SparkConf) override def receive: PartialFunction[Any, Unit] = { case GlutenCleanExecutionResource(executionId, hashIds) => - BackendsApiManager.getBroadcastApiInstance - .cleanExecutionBroadcastTable(executionId, hashIds) + if (executionId != null) { + hashIds.forEach( + resource_id => CHBroadcastBuildSideCache.invalidateBroadcastHashtable(resource_id)) + } case e => logError(s"Received unexpected message. 
$e") } } +object GlutenExecutorEndpoint { + var executorEndpoint: GlutenExecutorEndpoint = _ +} diff --git a/gluten-core/src/main/scala/org/apache/spark/rpc/GlutenRpcConstants.scala b/backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenRpcConstants.scala similarity index 100% rename from gluten-core/src/main/scala/org/apache/spark/rpc/GlutenRpcConstants.scala rename to backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenRpcConstants.scala diff --git a/gluten-core/src/main/scala/org/apache/spark/rpc/GlutenRpcMessages.scala b/backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenRpcMessages.scala similarity index 100% rename from gluten-core/src/main/scala/org/apache/spark/rpc/GlutenRpcMessages.scala rename to backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenRpcMessages.scala diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/shuffle/CHColumnarShuffleWriter.scala b/backends-clickhouse/src/main/scala/org/apache/spark/shuffle/CHColumnarShuffleWriter.scala index 3a80e18bdcd62..c113f8d4dd319 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/shuffle/CHColumnarShuffleWriter.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/shuffle/CHColumnarShuffleWriter.scala @@ -29,7 +29,6 @@ import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.{SparkDirectoryUtil, Utils} import java.io.IOException -import java.util import java.util.{Locale, UUID} class CHColumnarShuffleWriter[K, V]( @@ -61,7 +60,8 @@ class CHColumnarShuffleWriter[K, V]( GlutenConfig.getConf.chColumnarFlushBlockBufferBeforeEvict private val maxSortBufferSize = GlutenConfig.getConf.chColumnarMaxSortBufferSize private val spillFirstlyBeforeStop = GlutenConfig.getConf.chColumnarSpillFirstlyBeforeStop - private val forceSortShuffle = GlutenConfig.getConf.chColumnarForceSortShuffle + private val forceExternalSortShuffle = GlutenConfig.getConf.chColumnarForceExternalSortShuffle + private val forceMemorySortShuffle = GlutenConfig.getConf.chColumnarForceMemorySortShuffle private val spillThreshold = GlutenConfig.getConf.chColumnarShuffleSpillThreshold private val jniWrapper = new CHShuffleSplitterJniWrapper // Are we in the process of stopping? Because map tasks can call stop() with success = true @@ -115,25 +115,27 @@ class CHColumnarShuffleWriter[K, V]( flushBlockBufferBeforeEvict, maxSortBufferSize, spillFirstlyBeforeStop, - forceSortShuffle + forceExternalSortShuffle, + forceMemorySortShuffle ) CHNativeMemoryAllocators.createSpillable( "ShuffleWriter", new Spiller() { - override def spill(self: MemoryTarget, size: Long): Long = { + override def spill(self: MemoryTarget, phase: Spiller.Phase, size: Long): Long = { + if (!Spillers.PHASE_SET_SPILL_ONLY.contains(phase)) { + return 0L; + } if (nativeSplitter == 0) { throw new IllegalStateException( "Fatal: spill() called before a shuffle writer " + "is created. 
This behavior should be optimized by moving memory " + "allocations from make() to split()") } - logInfo(s"Gluten shuffle writer: Trying to spill $size bytes of data") + logError(s"Gluten shuffle writer: Trying to spill $size bytes of data") val spilled = splitterJniWrapper.evict(nativeSplitter); - logInfo(s"Gluten shuffle writer: Spilled $spilled / $size bytes of data") + logError(s"Gluten shuffle writer: Spilled $spilled / $size bytes of data") spilled } - - override def applicablePhases(): util.Set[Spiller.Phase] = Spillers.PHASE_SET_SPILL_ONLY } ) } diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala deleted file mode 100644 index e315602597202..0000000000000 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.sql.delta - -import org.apache.gluten.backendsapi.clickhouse.CHBackendSettings -import org.apache.gluten.execution.ColumnarToRowExecBase - -import org.apache.spark.SparkException -import org.apache.spark.sql.Dataset -import org.apache.spark.sql.delta.actions._ -import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 -import org.apache.spark.sql.delta.constraints.{Constraint, Constraints} -import org.apache.spark.sql.delta.files.MergeTreeCommitProtocol -import org.apache.spark.sql.delta.schema.InvariantViolationException -import org.apache.spark.sql.delta.sources.DeltaSQLConf -import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec -import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FakeRowAdaptor, FileFormatWriter, WriteJobStatsTracker} -import org.apache.spark.sql.execution.datasources.v1.clickhouse.MergeTreeFileFormatWriter -import org.apache.spark.sql.execution.datasources.v2.clickhouse.source.DeltaMergeTreeFileFormat -import org.apache.spark.util.{Clock, SerializableConfiguration} - -import org.apache.commons.lang3.exception.ExceptionUtils - -import scala.collection.mutable.ListBuffer - -object ClickhouseOptimisticTransaction {} -class ClickhouseOptimisticTransaction( - override val deltaLog: DeltaLog, - override val snapshot: Snapshot)(implicit override val clock: Clock) - extends OptimisticTransaction(deltaLog, snapshot) { - - def this(deltaLog: DeltaLog, snapshotOpt: Option[Snapshot] = None)(implicit clock: Clock) { - this( - deltaLog, - snapshotOpt.getOrElse(deltaLog.update()) - ) - } - - override def writeFiles( - inputData: Dataset[_], - writeOptions: Option[DeltaOptions], - additionalConstraints: Seq[Constraint]): Seq[FileAction] = { - hasWritten = true - - val spark = inputData.sparkSession - val (data, partitionSchema) = performCDCPartition(inputData) - val outputPath = deltaLog.dataPath - - val (queryExecution, output, generatedColumnConstraints, _) = - normalizeData(deltaLog, data) - val partitioningColumns = getPartitioningColumns(partitionSchema, output) - - val committer = new MergeTreeCommitProtocol("delta-mergetree", outputPath.toString, None) - - // val (optionalStatsTracker, _) = getOptionalStatsTrackerAndStatsCollection(output, outputPath, - // partitionSchema, data) - val (optionalStatsTracker, _) = (None, None) - - val constraints = - Constraints.getAll(metadata, spark) ++ generatedColumnConstraints ++ additionalConstraints - - SQLExecution.withNewExecutionId(queryExecution, Option("deltaTransactionalWrite")) { - val outputSpec = FileFormatWriter.OutputSpec(outputPath.toString, Map.empty, output) - - val queryPlan = queryExecution.executedPlan - val newQueryPlan = queryPlan match { - // if the child is columnar, we can just wrap&transfer the columnar data - case c2r: ColumnarToRowExecBase => - FakeRowAdaptor(c2r.child) - // If the child is aqe, we make aqe "support columnar", - // then aqe itself will guarantee to generate columnar outputs. 
- // So FakeRowAdaptor will always consumes columnar data, - // thus avoiding the case of c2r->aqe->r2c->writer - case aqe: AdaptiveSparkPlanExec => - FakeRowAdaptor( - AdaptiveSparkPlanExec( - aqe.inputPlan, - aqe.context, - aqe.preprocessingRules, - aqe.isSubquery, - supportsColumnar = true - )) - case other => FakeRowAdaptor(other) - } - - val statsTrackers: ListBuffer[WriteJobStatsTracker] = ListBuffer() - - if (spark.conf.get(DeltaSQLConf.DELTA_HISTORY_METRICS_ENABLED)) { - val basicWriteJobStatsTracker = new BasicWriteJobStatsTracker( - new SerializableConfiguration(deltaLog.newDeltaHadoopConf()), - BasicWriteJobStatsTracker.metrics) -// registerSQLMetrics(spark, basicWriteJobStatsTracker.driverSideMetrics) - statsTrackers.append(basicWriteJobStatsTracker) - } - - // Retain only a minimal selection of Spark writer options to avoid any potential - // compatibility issues - var options = writeOptions match { - case None => Map.empty[String, String] - case Some(writeOptions) => - writeOptions.options.filterKeys { - key => - key.equalsIgnoreCase(DeltaOptions.MAX_RECORDS_PER_FILE) || - key.equalsIgnoreCase(DeltaOptions.COMPRESSION) - }.toMap - } - - spark.conf.getAll.foreach( - entry => { - if ( - entry._1.startsWith(s"${CHBackendSettings.getBackendConfigPrefix}.runtime_settings") - || entry._1.equalsIgnoreCase(DeltaSQLConf.DELTA_OPTIMIZE_MIN_FILE_SIZE.key) - ) { - options += (entry._1 -> entry._2) - } - }) - - try { - val tableV2 = ClickHouseTableV2.getTable(deltaLog) - MergeTreeFileFormatWriter.write( - sparkSession = spark, - plan = newQueryPlan, - fileFormat = new DeltaMergeTreeFileFormat( - metadata, - tableV2.dataBaseName, - tableV2.tableName, - ClickhouseSnapshot.genSnapshotId(tableV2.snapshot), - tableV2.orderByKeyOption, - tableV2.lowCardKeyOption, - tableV2.minmaxIndexKeyOption, - tableV2.bfIndexKeyOption, - tableV2.setIndexKeyOption, - tableV2.primaryKeyOption, - tableV2.clickhouseTableConfigs, - tableV2.partitionColumns - ), - // formats. - committer = committer, - outputSpec = outputSpec, - // scalastyle:off deltahadoopconfiguration - hadoopConf = - spark.sessionState.newHadoopConfWithOptions(metadata.configuration ++ deltaLog.options), - // scalastyle:on deltahadoopconfiguration - orderByKeyOption = tableV2.orderByKeyOption, - lowCardKeyOption = tableV2.lowCardKeyOption, - minmaxIndexKeyOption = tableV2.minmaxIndexKeyOption, - bfIndexKeyOption = tableV2.bfIndexKeyOption, - setIndexKeyOption = tableV2.setIndexKeyOption, - primaryKeyOption = tableV2.primaryKeyOption, - partitionColumns = partitioningColumns, - bucketSpec = tableV2.bucketOption, - statsTrackers = optionalStatsTracker.toSeq ++ statsTrackers, - options = options, - constraints = constraints - ) - } catch { - case s: SparkException => - // Pull an InvariantViolationException up to the top level if it was the root cause. - val violationException = ExceptionUtils.getRootCause(s) - if (violationException.isInstanceOf[InvariantViolationException]) { - throw violationException - } else { - throw s - } - } - } - - // val resultFiles = committer.addedStatuses - // .map { - // a => - // a.copy(stats = optionalStatsTracker - // .map(_.recordedStats(new Path(new URI(a.path)).getName)) - // .getOrElse(a.stats)) - // } - /* - .filter { - // In some cases, we can write out an empty `inputData`. - // Some examples of this (though, they - // may be fixed in the future) are the MERGE command when you delete with empty source, or - // empty target, or on disjoint tables. 
This is hard to catch before the write without - // collecting the DF ahead of time. Instead, we can return only the AddFiles that - // a) actually add rows, or - // b) don't have any stats so we don't know the number of rows at all - case a: AddFile => a.numLogicalRecords.forall(_ > 0) - case _ => true - } - */ - - committer.addedStatuses.toSeq ++ committer.changeFiles - } -} diff --git a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/execution/datasources/v2/clickhouse/DeltaLogAdapter.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/DeltaAdapterTrait.scala similarity index 79% rename from backends-clickhouse/src/main/delta-20/org/apache/spark/sql/execution/datasources/v2/clickhouse/DeltaLogAdapter.scala rename to backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/DeltaAdapterTrait.scala index 4283d6728fc7b..3ea4af4ae7087 100644 --- a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/execution/datasources/v2/clickhouse/DeltaLogAdapter.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/DeltaAdapterTrait.scala @@ -14,10 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution.datasources.v2.clickhouse +package org.apache.spark.sql.delta -import org.apache.spark.sql.delta.{DeltaLog, Snapshot} +trait DeltaAdapterTrait { -object DeltaLogAdapter { - def snapshot(deltaLog: DeltaLog): Snapshot = deltaLog.snapshot + def snapshot(deltaLog: DeltaLog): Snapshot } diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/catalog/ClickHouseTableV2Base.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/catalog/ClickHouseTableV2Base.scala new file mode 100644 index 0000000000000..9c129b9f5d918 --- /dev/null +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/catalog/ClickHouseTableV2Base.scala @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.delta.catalog + +import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable} +import org.apache.spark.sql.delta.Snapshot + +import org.apache.hadoop.fs.Path + +import java.{util => ju} + +trait ClickHouseTableV2Base { + + def deltaProperties(): ju.Map[String, String] + + def deltaCatalog(): Option[CatalogTable] + + def deltaPath(): Path + + def deltaSnapshot(): Snapshot + + lazy val dataBaseName = deltaCatalog + .map(_.identifier.database.getOrElse("default")) + .getOrElse("clickhouse") + + lazy val tableName = deltaCatalog + .map(_.identifier.table) + .getOrElse(deltaPath.toUri.getPath) + + lazy val bucketOption: Option[BucketSpec] = { + val tableProperties = deltaProperties + if (tableProperties.containsKey("numBuckets")) { + val numBuckets = tableProperties.get("numBuckets").trim.toInt + val bucketColumnNames: Seq[String] = + tableProperties.get("bucketColumnNames").split(",").map(_.trim).toSeq + val sortColumnNames: Seq[String] = if (tableProperties.containsKey("orderByKey")) { + tableProperties.get("orderByKey").split(",").map(_.trim).toSeq + } else Seq.empty[String] + Some(BucketSpec(numBuckets, bucketColumnNames, sortColumnNames)) + } else { + None + } + } + + lazy val lowCardKeyOption: Option[Seq[String]] = { + getCommaSeparatedColumns("lowCardKey") + } + + lazy val minmaxIndexKeyOption: Option[Seq[String]] = { + getCommaSeparatedColumns("minmaxIndexKey") + } + + lazy val bfIndexKeyOption: Option[Seq[String]] = { + getCommaSeparatedColumns("bloomfilterIndexKey") + } + + lazy val setIndexKeyOption: Option[Seq[String]] = { + getCommaSeparatedColumns("setIndexKey") + } + + private def getCommaSeparatedColumns(keyName: String) = { + val tableProperties = deltaProperties + if (tableProperties.containsKey(keyName)) { + if (tableProperties.get(keyName).nonEmpty) { + val keys = tableProperties.get(keyName).split(",").map(_.trim).toSeq + keys.foreach( + s => { + if (s.contains(".")) { + throw new IllegalStateException( + s"$keyName $s can not contain '.' 
(not support nested column yet)") + } + }) + Some(keys.map(s => s.toLowerCase())) + } else { + None + } + } else { + None + } + } + + lazy val orderByKeyOption: Option[Seq[String]] = { + if (bucketOption.isDefined && bucketOption.get.sortColumnNames.nonEmpty) { + val orderByKes = bucketOption.get.sortColumnNames + val invalidKeys = orderByKes.intersect(partitionColumns) + if (invalidKeys.nonEmpty) { + throw new IllegalStateException( + s"partition cols $invalidKeys can not be in the order by keys.") + } + Some(orderByKes) + } else { + val tableProperties = deltaProperties + if (tableProperties.containsKey("orderByKey")) { + if (tableProperties.get("orderByKey").nonEmpty) { + val orderByKes = tableProperties.get("orderByKey").split(",").map(_.trim).toSeq + val invalidKeys = orderByKes.intersect(partitionColumns) + if (invalidKeys.nonEmpty) { + throw new IllegalStateException( + s"partition cols $invalidKeys can not be in the order by keys.") + } + Some(orderByKes) + } else { + None + } + } else { + None + } + } + } + + lazy val primaryKeyOption: Option[Seq[String]] = { + if (orderByKeyOption.isDefined) { + val tableProperties = deltaProperties + if (tableProperties.containsKey("primaryKey")) { + if (tableProperties.get("primaryKey").nonEmpty) { + val primaryKeys = tableProperties.get("primaryKey").split(",").map(_.trim).toSeq + if (!orderByKeyOption.get.mkString(",").startsWith(primaryKeys.mkString(","))) { + throw new IllegalStateException( + s"Primary key $primaryKeys must be a prefix of the sorting key") + } + Some(primaryKeys) + } else { + None + } + } else { + None + } + } else { + None + } + } + + lazy val partitionColumns = deltaSnapshot.metadata.partitionColumns + + lazy val clickhouseTableConfigs: Map[String, String] = { + val tableProperties = deltaProperties() + val configs = scala.collection.mutable.Map[String, String]() + configs += ("storage_policy" -> tableProperties.getOrDefault("storage_policy", "default")) + configs.toMap + } + + def primaryKey(): String = primaryKeyOption match { + case Some(keys) => keys.mkString(",") + case None => "" + } + + def orderByKey(): String = orderByKeyOption match { + case Some(keys) => keys.mkString(",") + case None => "tuple()" + } + + def lowCardKey(): String = lowCardKeyOption match { + case Some(keys) => keys.mkString(",") + case None => "" + } + + def minmaxIndexKey(): String = minmaxIndexKeyOption match { + case Some(keys) => keys.mkString(",") + case None => "" + } + + def bfIndexKey(): String = bfIndexKeyOption match { + case Some(keys) => keys.mkString(",") + case None => "" + } + + def setIndexKey(): String = setIndexKeyOption match { + case Some(keys) => keys.mkString(",") + case None => "" + } +} diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/utils/MergeTreePartsPartitionsUtil.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/utils/MergeTreePartsPartitionsUtil.scala index 80257c3b5e640..ac6ac959f97cc 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/utils/MergeTreePartsPartitionsUtil.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/utils/MergeTreePartsPartitionsUtil.scala @@ -127,7 +127,7 @@ object MergeTreePartsPartitionsUtil extends Logging { sparkSession ) } - partitions + partitions.toSeq } def genInputPartitionSeq( diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v1/CHMergeTreeWriterInjects.scala 
b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v1/CHMergeTreeWriterInjects.scala index 8a61385fcbd0a..e11406d566195 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v1/CHMergeTreeWriterInjects.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v1/CHMergeTreeWriterInjects.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.v1 import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.expression.ConverterUtils import org.apache.gluten.memory.alloc.CHNativeMemoryAllocators +import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.substrait.`type`.ColumnTypeNode import org.apache.gluten.substrait.SubstraitContext import org.apache.gluten.substrait.expression.{ExpressionBuilder, StringMapNode} @@ -93,7 +94,8 @@ class CHMergeTreeWriterInjects extends GlutenFormatWriterInjectsBase { Seq(), ConverterUtils.convertNamedStructJson(tableSchema), clickhouseTableConfigs, - tableSchema.toAttributes // use table schema instead of data schema + // use table schema instead of data schema + SparkShimLoader.getSparkShims.attributesFromStruct(tableSchema) ) val allocId = CHNativeMemoryAllocators.contextInstance.getNativeInstanceId val datasourceJniWrapper = new CHDatasourceJniWrapper() diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v1/clickhouse/MergeTreeFileFormatDataWriter.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v1/clickhouse/MergeTreeFileFormatDataWriter.scala index 3a68ac16df80d..712afb3788d10 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v1/clickhouse/MergeTreeFileFormatDataWriter.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v1/clickhouse/MergeTreeFileFormatDataWriter.scala @@ -117,10 +117,12 @@ abstract class MergeTreeFileFormatDataWriter( releaseResources() val (taskCommitMessage, taskCommitTime) = Utils.timeTakenMs { // committer.commitTask(taskAttemptContext) - val statuses = returnedMetrics.map( - v => { - v._2 - }) + val statuses = returnedMetrics + .map( + v => { + v._2 + }) + .toSeq new TaskCommitMessage(statuses) } diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v1/clickhouse/MergeTreeFileFormatWriter.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v1/clickhouse/MergeTreeFileFormatWriter.scala index b6da32cfb1dfc..ad2f3851627c0 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v1/clickhouse/MergeTreeFileFormatWriter.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v1/clickhouse/MergeTreeFileFormatWriter.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.sql.execution.datasources.v1.clickhouse -import org.apache.spark.{TaskContext, TaskOutputFileAlreadyExistException} +import org.apache.spark.{SparkException, TaskContext, TaskOutputFileAlreadyExistException} import org.apache.spark.internal.Logging import org.apache.spark.internal.io.{FileCommitProtocol, SparkHadoopWriterUtils} import org.apache.spark.shuffle.FetchFailedException @@ -28,7 +28,6 @@ import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import 
org.apache.spark.sql.delta.constraints.Constraint -import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.FileFormatWriter.{processStats, ConcurrentOutputWriterSpec, OutputSpec} @@ -249,7 +248,7 @@ object MergeTreeFileFormatWriter extends Logging { case cause: Throwable => logError(s"Aborting job ${description.uuid}.", cause) committer.abortJob(job) - throw QueryExecutionErrors.jobAbortedError(cause) + throw cause } } // scalastyle:on argcount @@ -329,7 +328,7 @@ object MergeTreeFileFormatWriter extends Logging { // We throw the exception and let Executor throw ExceptionFailure to abort the job. throw new TaskOutputFileAlreadyExistException(f) case t: Throwable => - throw QueryExecutionErrors.taskFailedWhileWritingRowsError(t) + throw new SparkException("Task failed while writing rows.", t) } } } diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseConfig.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseConfig.scala index e7eb3aeb05d04..232e9ec10c5be 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseConfig.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/datasources/v2/clickhouse/ClickHouseConfig.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution.datasources.v2.clickhouse import org.apache.spark.sql.catalyst.catalog.BucketSpec -import org.apache.spark.sql.connector.catalog.TableCatalog import java.util @@ -29,7 +28,8 @@ object ClickHouseConfig { val NAME = "clickhouse" val ALT_NAME = "clickhouse" val METADATA_DIR = "_delta_log" - val DEFAULT_ENGINE = "MergeTree" + val FORMAT_ENGINE = "engine" + val DEFAULT_ENGINE = "mergetree" val OPT_NAME_PREFIX = "clickhouse." 
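// Illustrative sketch only, not part of this change: the lower-cased FORMAT_ENGINE / DEFAULT_ENGINE
// constants above are read back by the new isMergeTreeFormatEngine helper added further down in this
// file. Assuming an immutable Map of table properties (the property values shown here are
// hypothetical), the helper is expected to behave roughly like:
//   ClickHouseConfig.isMergeTreeFormatEngine(Map(ClickHouseConfig.FORMAT_ENGINE -> "mergetree")) // true
//   ClickHouseConfig.isMergeTreeFormatEngine(Map(ClickHouseConfig.FORMAT_ENGINE -> "parquet"))   // false
//   ClickHouseConfig.isMergeTreeFormatEngine(Map.empty[String, String])                          // false: key absent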
@deprecated @@ -47,16 +47,15 @@ object ClickHouseConfig { buckets: Option[BucketSpec] = None): Map[String, String] = { val configurations = scala.collection.mutable.Map[String, String]() allProperties.asScala.foreach(configurations += _) - configurations.put(TableCatalog.PROP_PROVIDER, ClickHouseConfig.NAME) if (!configurations.contains("metadata_path")) { configurations += ("metadata_path" -> METADATA_DIR) } - if (!configurations.contains("engine")) { - configurations += ("engine" -> DEFAULT_ENGINE) + if (!configurations.contains(FORMAT_ENGINE)) { + configurations += (FORMAT_ENGINE -> DEFAULT_ENGINE) } else { - val engineValue = configurations.get("engine") + val engineValue = configurations.get(FORMAT_ENGINE) if (!engineValue.equals(DEFAULT_ENGINE) && !engineValue.equals("parquet")) { - configurations += ("engine" -> DEFAULT_ENGINE) + configurations += (FORMAT_ENGINE -> DEFAULT_ENGINE) } } if (!configurations.contains("sampling_key")) { @@ -80,6 +79,11 @@ object ClickHouseConfig { configurations.toMap } + def isMergeTreeFormatEngine(configuration: Map[String, String]): Boolean = { + configuration.contains(FORMAT_ENGINE) && + configuration.get(FORMAT_ENGINE).get.equals(DEFAULT_ENGINE) + } + /** Get the related clickhouse option when using DataFrameWriter / DataFrameReader */ def getMergeTreeConfigurations( properties: util.Map[String, String] diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/CHExecUtil.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/CHExecUtil.scala index 78fd5dd396e94..17eb0ed0b037b 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/CHExecUtil.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/CHExecUtil.scala @@ -23,7 +23,7 @@ import org.apache.gluten.row.SparkRowInfo import org.apache.gluten.vectorized._ import org.apache.gluten.vectorized.BlockSplitIterator.IteratorOptions -import org.apache.spark.ShuffleDependency +import org.apache.spark.{Partitioner, ShuffleDependency} import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.serializer.Serializer @@ -33,7 +33,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences, BoundReference, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering import org.apache.spark.sql.catalyst.plans.physical.{SinglePartition, _} -import org.apache.spark.sql.execution.{PartitionIdPassthrough, SparkPlan} +import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleWriteMetricsReporter} import org.apache.spark.sql.internal.SQLConf @@ -354,3 +354,8 @@ object CHExecUtil extends Logging { dependency } } + +// Copy from the Vanilla Spark +private class PartitionIdPassthrough(override val numPartitions: Int) extends Partitioner { + override def getPartition(key: Any): Int = key.asInstanceOf[Int] +} diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/PushDownUtil.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/PushDownUtil.scala index 4de85620d0ac0..4eb326fe9a114 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/PushDownUtil.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/PushDownUtil.scala @@ 
-16,42 +16,24 @@ */ package org.apache.spark.sql.execution.utils +import org.apache.gluten.sql.shims.SparkShimLoader + import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} -import org.apache.spark.sql.catalyst.util.RebaseDateTime.RebaseSpec import org.apache.spark.sql.execution.datasources.DataSourceStrategy -import org.apache.spark.sql.execution.datasources.parquet.{ParquetFilters, SparkToParquetSchemaConverter} +import org.apache.spark.sql.execution.datasources.parquet.SparkToParquetSchemaConverter import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy import org.apache.spark.sql.sources -import org.apache.spark.sql.types.StructType - -import org.apache.parquet.schema.MessageType object PushDownUtil { - private def createParquetFilters( - conf: SQLConf, - schema: MessageType, - caseSensitive: Option[Boolean] = None, - datetimeRebaseSpec: RebaseSpec = RebaseSpec(LegacyBehaviorPolicy.CORRECTED) - ): ParquetFilters = - new ParquetFilters( - schema, - conf.parquetFilterPushDownDate, - conf.parquetFilterPushDownTimestamp, - conf.parquetFilterPushDownDecimal, - conf.parquetFilterPushDownStringStartWith, - conf.parquetFilterPushDownInFilterThreshold, - caseSensitive.getOrElse(conf.caseSensitiveAnalysis), - datetimeRebaseSpec - ) def removeNotSupportPushDownFilters( conf: SQLConf, output: Seq[Attribute], dataFilters: Seq[Expression] ): Seq[Expression] = { - val schema = new SparkToParquetSchemaConverter(conf).convert(StructType.fromAttributes(output)) - val parquetFilters = createParquetFilters(conf, schema) + val schema = new SparkToParquetSchemaConverter(conf).convert( + SparkShimLoader.getSparkShims.structFromAttributes(output)) + val parquetFilters = SparkShimLoader.getSparkShims.createParquetFilters(conf, schema) dataFilters .flatMap { diff --git a/backends-clickhouse/src/test/java/org/apache/spark/memory/TestTaskMemoryManagerSuite.java b/backends-clickhouse/src/test/java/org/apache/spark/memory/TestTaskMemoryManagerSuite.java index b575de403bfde..905ffacde023d 100644 --- a/backends-clickhouse/src/test/java/org/apache/spark/memory/TestTaskMemoryManagerSuite.java +++ b/backends-clickhouse/src/test/java/org/apache/spark/memory/TestTaskMemoryManagerSuite.java @@ -21,6 +21,7 @@ import org.apache.gluten.memory.alloc.CHNativeMemoryAllocator; import org.apache.gluten.memory.alloc.CHNativeMemoryAllocatorManagerImpl; import org.apache.gluten.memory.memtarget.MemoryTargets; +import org.apache.gluten.memory.memtarget.Spillers; import org.apache.spark.SparkConf; import org.apache.spark.internal.config.package$; @@ -52,7 +53,7 @@ public void initMemoryManager() { listener = new CHManagedCHReservationListener( MemoryTargets.newConsumer( - taskMemoryManager, "test", Collections.emptyList(), Collections.emptyMap()), + taskMemoryManager, "test", Spillers.NOOP, Collections.emptyMap()), new SimpleMemoryUsageRecorder()); manager = new CHNativeMemoryAllocatorManagerImpl(new CHNativeMemoryAllocator(-1L, listener)); diff --git a/backends-clickhouse/src/test/resources/text-data/abnormal-json/data2.txt b/backends-clickhouse/src/test/resources/text-data/abnormal-json/data2.txt index 485fb6d7205a8..e323f5986f7bb 100644 Binary files a/backends-clickhouse/src/test/resources/text-data/abnormal-json/data2.txt and b/backends-clickhouse/src/test/resources/text-data/abnormal-json/data2.txt differ diff --git a/backends-clickhouse/src/test/resources/tpcds-queries-output/q17.out b/backends-clickhouse/src/test/resources/tpcds-queries-output/q17.out 
index ab9c217f7d73f..95891dd390985 100644 --- a/backends-clickhouse/src/test/resources/tpcds-queries-output/q17.out +++ b/backends-clickhouse/src/test/resources/tpcds-queries-output/q17.out @@ -1,2 +1,2 @@ -AAAAAAAADOCAAAAA|-|Little, national services will buy young molecules. In part video-taped activities join now|-|TN|-|1|-|24.0|-|NaN|-|NaN|-|1|-|11.0|-|NaN|-|NaN|-|1|-|49.0|-|NaN|-|NaN -AAAAAAAAEBOBAAAA|-|Special words should tell by a follower|-|TN|-|1|-|66.0|-|NaN|-|NaN|-|1|-|38.0|-|NaN|-|NaN|-|1|-|56.0|-|NaN|-|NaN +AAAAAAAADOCAAAAA|-|Little, national services will buy young molecules. In part video-taped activities join now|-|TN|-|1|-|24.0|-|null|-|null|-|1|-|11.0|-|null|-|null|-|1|-|49.0|-|null|-|null +AAAAAAAAEBOBAAAA|-|Special words should tell by a follower|-|TN|-|1|-|66.0|-|null|-|null|-|1|-|38.0|-|null|-|null|-|1|-|56.0|-|null|-|null diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseColumnarSortShuffleAQESuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseColumnarExternalSortShuffleSuite.scala similarity index 52% rename from backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseColumnarSortShuffleAQESuite.scala rename to backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseColumnarExternalSortShuffleSuite.scala index 098c7117bf7d3..be36cd998485d 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseColumnarSortShuffleAQESuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseColumnarExternalSortShuffleSuite.scala @@ -17,10 +17,9 @@ package org.apache.gluten.execution import org.apache.spark.SparkConf -import org.apache.spark.sql.execution.CoalescedPartitionSpec -import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveSparkPlanHelper, AQEShuffleReadExec} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper -class GlutenClickHouseColumnarSortShuffleAQESuite +class GlutenClickHouseColumnarExternalSortShuffleSuite extends GlutenClickHouseTPCHAbstractSuite with AdaptiveSparkPlanHelper { @@ -36,29 +35,11 @@ class GlutenClickHouseColumnarSortShuffleAQESuite .set("spark.sql.shuffle.partitions", "5") .set("spark.sql.autoBroadcastJoinThreshold", "10MB") .set("spark.sql.adaptive.enabled", "true") - .set("spark.gluten.sql.columnar.backend.ch.forceSortShuffle", "true") + .set("spark.gluten.sql.columnar.backend.ch.forceExternalSortShuffle", "true") } test("TPCH Q1") { - runTPCHQuery(1) { - df => - assert(df.queryExecution.executedPlan.isInstanceOf[AdaptiveSparkPlanExec]) - - val colCustomShuffleReaderExecs = collect(df.queryExecution.executedPlan) { - case csr: AQEShuffleReadExec => csr - } - assert(colCustomShuffleReaderExecs.size == 2) - val coalescedPartitionSpec0 = colCustomShuffleReaderExecs(0) - .partitionSpecs(0) - .asInstanceOf[CoalescedPartitionSpec] - assert(coalescedPartitionSpec0.startReducerIndex == 0) - assert(coalescedPartitionSpec0.endReducerIndex == 5) - val coalescedPartitionSpec1 = colCustomShuffleReaderExecs(1) - .partitionSpecs(0) - .asInstanceOf[CoalescedPartitionSpec] - assert(coalescedPartitionSpec1.startReducerIndex == 0) - assert(coalescedPartitionSpec1.endReducerIndex == 5) - } + runTPCHQuery(1) { df => } } test("TPCH Q2") { @@ -98,14 +79,7 @@ class GlutenClickHouseColumnarSortShuffleAQESuite } test("TPCH Q11") { - runTPCHQuery(11) { - df => - 
assert(df.queryExecution.executedPlan.isInstanceOf[AdaptiveSparkPlanExec]) - val adaptiveSparkPlanExec = collectWithSubqueries(df.queryExecution.executedPlan) { - case adaptive: AdaptiveSparkPlanExec => adaptive - } - assert(adaptiveSparkPlanExec.size == 2) - } + runTPCHQuery(11) { df => } } test("TPCH Q12") { @@ -121,14 +95,7 @@ class GlutenClickHouseColumnarSortShuffleAQESuite } test("TPCH Q15") { - runTPCHQuery(15) { - df => - assert(df.queryExecution.executedPlan.isInstanceOf[AdaptiveSparkPlanExec]) - val adaptiveSparkPlanExec = collectWithSubqueries(df.queryExecution.executedPlan) { - case adaptive: AdaptiveSparkPlanExec => adaptive - } - assert(adaptiveSparkPlanExec.size == 2) - } + runTPCHQuery(15) { df => } } test("TPCH Q16") { @@ -140,13 +107,7 @@ class GlutenClickHouseColumnarSortShuffleAQESuite } test("TPCH Q18") { - runTPCHQuery(18) { - df => - val hashAggregates = collect(df.queryExecution.executedPlan) { - case hash: HashAggregateExecBaseTransformer => hash - } - assert(hashAggregates.size == 3) - } + runTPCHQuery(18) { df => } } test("TPCH Q19") { @@ -162,14 +123,6 @@ class GlutenClickHouseColumnarSortShuffleAQESuite } test("TPCH Q22") { - runTPCHQuery(22) { - df => - assert(df.queryExecution.executedPlan.isInstanceOf[AdaptiveSparkPlanExec]) - val adaptiveSparkPlanExec = collectWithSubqueries(df.queryExecution.executedPlan) { - case adaptive: AdaptiveSparkPlanExec => adaptive - } - assert(adaptiveSparkPlanExec.size == 3) - assert(adaptiveSparkPlanExec(1) == adaptiveSparkPlanExec(2)) - } + runTPCHQuery(22) { df => } } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseColumnarMemorySortShuffleSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseColumnarMemorySortShuffleSuite.scala new file mode 100644 index 0000000000000..b9d580c7249cd --- /dev/null +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseColumnarMemorySortShuffleSuite.scala @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.execution + +import org.apache.spark.SparkConf +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper + +class GlutenClickHouseColumnarMemorySortShuffleSuite + extends GlutenClickHouseTPCHAbstractSuite + with AdaptiveSparkPlanHelper { + + override protected val tablesPath: String = basePath + "/tpch-data-ch" + override protected val tpchQueries: String = rootPath + "queries/tpch-queries-ch" + override protected val queriesResults: String = rootPath + "mergetree-queries-output" + + /** Run Gluten + ClickHouse Backend with ColumnarShuffleManager */ + override protected def sparkConf: SparkConf = { + super.sparkConf + .set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager") + .set("spark.io.compression.codec", "LZ4") + .set("spark.sql.shuffle.partitions", "5") + .set("spark.sql.autoBroadcastJoinThreshold", "10MB") + .set("spark.sql.adaptive.enabled", "true") + .set("spark.gluten.sql.columnar.backend.ch.forceMemorySortShuffle", "true") + } + + test("TPCH Q1") { + runTPCHQuery(1) { df => } + } + + test("TPCH Q2") { + runTPCHQuery(2) { df => } + } + + test("TPCH Q3") { + runTPCHQuery(3) { df => } + } + + test("TPCH Q4") { + runTPCHQuery(4) { df => } + } + + test("TPCH Q5") { + runTPCHQuery(5) { df => } + } + + test("TPCH Q6") { + runTPCHQuery(6) { df => } + } + + test("TPCH Q7") { + runTPCHQuery(7) { df => } + } + + test("TPCH Q8") { + runTPCHQuery(8) { df => } + } + + test("TPCH Q9") { + runTPCHQuery(9) { df => } + } + + test("TPCH Q10") { + runTPCHQuery(10) { df => } + } + + test("TPCH Q11") { + runTPCHQuery(11) { df => } + } + + test("TPCH Q12") { + runTPCHQuery(12) { df => } + } + + test("TPCH Q13") { + runTPCHQuery(13) { df => } + } + + test("TPCH Q14") { + runTPCHQuery(14) { df => } + } + + test("TPCH Q15") { + runTPCHQuery(15) { df => } + } + + test("TPCH Q16") { + runTPCHQuery(16, noFallBack = false) { df => } + } + + test("TPCH Q17") { + runTPCHQuery(17) { df => } + } + + test("TPCH Q18") { + runTPCHQuery(18) { df => } + } + + test("TPCH Q19") { + runTPCHQuery(19) { df => } + } + + test("TPCH Q20") { + runTPCHQuery(20) { df => } + } + + test("TPCH Q21") { + runTPCHQuery(21, noFallBack = false) { df => } + } + + test("TPCH Q22") { + runTPCHQuery(22) { df => } + } +} diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDatetimeExpressionSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDatetimeExpressionSuite.scala index 295ad4f251366..53416607521ef 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDatetimeExpressionSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDatetimeExpressionSuite.scala @@ -41,6 +41,7 @@ class GlutenClickHouseDatetimeExpressionSuite .set("spark.sql.shuffle.partitions", "5") .set("spark.sql.autoBroadcastJoinThreshold", "10MB") .set("spark.sql.adaptive.enabled", "true") + .set("spark.sql.session.timeZone", "GMT+08:00") } override protected def createTPCHNotNullTables(): Unit = { @@ -143,7 +144,7 @@ class GlutenClickHouseDatetimeExpressionSuite | date_trunc('month', t) c |from date_trunc_tmp1 |""".stripMargin - compareResultsAgainstVanillaSpark(sql2, true, { _ => }, false) + compareResultsAgainstVanillaSpark(sql2, true, { _ => }) } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDecimalSuite.scala 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDecimalSuite.scala index c41ea0ccb2ea0..7320b7c051522 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDecimalSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDecimalSuite.scala @@ -67,9 +67,9 @@ class GlutenClickHouseDecimalSuite private val decimalTPCHTables: Seq[(DecimalType, Seq[Int])] = Seq.apply( (DecimalType.apply(9, 4), Seq()), // 1: ch decimal avg is float - (DecimalType.apply(18, 8), Seq(1)), + (DecimalType.apply(18, 8), Seq()), // 1: ch decimal avg is float, 3/10: all value is null and compare with limit - (DecimalType.apply(38, 19), Seq(1, 3, 10)) + (DecimalType.apply(38, 19), Seq(3, 10)) ) private def createDecimalTables(dataType: DecimalType): Unit = { @@ -300,6 +300,39 @@ class GlutenClickHouseDecimalSuite customCheck = customCheck, noFallBack = noFallBack) } + + test("from decimalArithmeticOperations.sql") { + // prepare + val createSql = + "create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet" + val inserts = + "insert into decimals_test values(1, 100.0, 999.0)" + + ", (2, 12345.123, 12345.123)" + + ", (3, 0.1234567891011, 1234.1)" + + ", (4, 123456789123456789.0, 1.123456789123456789)" + spark.sql(createSql) + + try { + spark.sql(inserts) + + val q1 = "select id, a+b, a-b, a*b, a/b ,a%b from decimals_test order by id" + + // test operations between decimals and constants + val q2 = "select id, a*10, b/10 from decimals_test order by id" + // FIXME val q2 = "select id, a*10, b/10, a%20, b%30 from decimals_test order by id" + + Seq("true", "false").foreach { + allowPrecisionLoss => + withSQLConf((SQLConf.DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key, allowPrecisionLoss)) { + compareResultsAgainstVanillaSpark(q1, compareResult = true, _ => {}) + compareResultsAgainstVanillaSpark(q2, compareResult = true, _ => {}) + } + } + } finally { + spark.sql("drop table if exists decimals_test") + } + } + // FIXME: Support AVG for Decimal Type Seq("true", "false").foreach { allowPrecisionLoss => Range @@ -390,6 +423,30 @@ class GlutenClickHouseDecimalSuite compareResultsAgainstVanillaSpark(sql_not_null, compareResult = true, _ => {}) } + test("bigint % 6.1") { + val sql = + s""" + | select + | s_suppkey, + | s_suppkey % 6.1 + | from supplier + |""".stripMargin + spark.sql(s"use decimal_${9}_${4}") + withSQLConf(vanillaSparkConfs(): _*) { + val df2 = spark.sql(sql) + print(df2.queryExecution.executedPlan) + } + testFromRandomBase( + sql, + _ => {} + ) + } + + test("Fix issue(6015) allow overflow when converting decimal to integer") { + val sql = "select int(cast(id * 9999999999 as decimal(29, 2))) from range(10)" + runQueryAndCompare(sql)(checkGlutenOperatorMatch[ProjectExecTransformer]) + } + def testFromRandomBase( sql: String, customCheck: DataFrame => Unit, diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDeltaParquetWriteSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDeltaParquetWriteSuite.scala new file mode 100644 index 0000000000000..8f8351baeae10 --- /dev/null +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDeltaParquetWriteSuite.scala @@ -0,0 +1,1422 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.execution + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SaveMode +import org.apache.spark.sql.delta.actions.AddFile +import org.apache.spark.sql.delta.files.TahoeFileIndex +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper + +import io.delta.tables.DeltaTable + +import java.io.File + +// Some sqls' line length exceeds 100 +// scalastyle:off line.size.limit + +class GlutenClickHouseDeltaParquetWriteSuite + extends GlutenClickHouseTPCHAbstractSuite + with AdaptiveSparkPlanHelper { + + override protected val needCopyParquetToTablePath = true + + override protected val tablesPath: String = basePath + "/tpch-data" + override protected val tpchQueries: String = rootPath + "queries/tpch-queries-ch" + override protected val queriesResults: String = rootPath + "mergetree-queries-output" + + /** Run Gluten + ClickHouse Backend with SortShuffleManager */ + override protected def sparkConf: SparkConf = { + super.sparkConf + .set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager") + .set("spark.io.compression.codec", "LZ4") + .set("spark.sql.shuffle.partitions", "5") + .set("spark.sql.autoBroadcastJoinThreshold", "10MB") + .set("spark.sql.adaptive.enabled", "true") + .set("spark.sql.files.maxPartitionBytes", "20000000") + .set("spark.gluten.sql.native.writer.enabled", "true") + .set("spark.sql.storeAssignmentPolicy", "legacy") + .set( + "spark.gluten.sql.columnar.backend.ch.runtime_settings.mergetree.merge_after_insert", + "false") + .set( + "spark.databricks.delta.retentionDurationCheck.enabled", + "false" + ) + } + + override protected def createTPCHNotNullTables(): Unit = { + createNotNullTPCHTablesInParquet(tablesPath) + } + + test("test parquet table write with the delta") { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_delta_parquet; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_delta_parquet + |( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode string, + | l_comment string + |) + |USING delta + |LOCATION '$basePath/lineitem_delta_parquet' + |""".stripMargin) + + spark.sql(s""" + | insert into table lineitem_delta_parquet + | select /*+ REPARTITION(5) */ * from lineitem + |""".stripMargin) + + val sqlStr = + s""" + |SELECT + | l_returnflag, + | l_linestatus, + | sum(l_quantity) AS sum_qty, + | sum(l_extendedprice) AS sum_base_price, + | sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price, + | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge, + | avg(l_quantity) AS 
avg_qty, + | avg(l_extendedprice) AS avg_price, + | avg(l_discount) AS avg_disc, + | count(*) AS count_order + |FROM + | lineitem_delta_parquet + |WHERE + | l_shipdate <= date'1998-09-02' - interval 1 day + |GROUP BY + | l_returnflag, + | l_linestatus + |ORDER BY + | l_returnflag, + | l_linestatus; + | + |""".stripMargin + runTPCHQueryBySQL(1, sqlStr) { + df => + val plans = collect(df.queryExecution.executedPlan) { + case f: FileSourceScanExecTransformer => f + case w: WholeStageTransformer => w + } + assert(plans.size == 4) + + val parquetScan = plans(3).asInstanceOf[FileSourceScanExecTransformer] + assert(parquetScan.nodeName.startsWith("Scan parquet ")) + + val fileIndex = parquetScan.relation.location.asInstanceOf[TahoeFileIndex] + val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddFile]) + assert(addFiles.size == 5) + } + } + + test("test parquet insert overwrite with the delta") { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_delta_parquet_insertoverwrite; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_delta_parquet_insertoverwrite + |( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode string, + | l_comment string + |) + |USING delta + |LOCATION '$basePath/lineitem_delta_parquet_insertoverwrite' + |""".stripMargin) + + spark.sql(s""" + | insert into table lineitem_delta_parquet_insertoverwrite + | select * from lineitem + |""".stripMargin) + + spark.sql(s""" + | insert overwrite table lineitem_delta_parquet_insertoverwrite + | select * from lineitem where mod(l_orderkey,2) = 1 + |""".stripMargin) + val sql2 = + s""" + | select count(*) from lineitem_delta_parquet_insertoverwrite + |""".stripMargin + assert( + // total rows should remain unchanged + spark.sql(sql2).collect().apply(0).get(0) == 300001 + ) + } + + test("test parquet insert overwrite partitioned table with small table, static with delta") { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_delta_parquet_insertoverwrite2; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_delta_parquet_insertoverwrite2 + |( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode string, + | l_comment string + |) + |USING delta + |PARTITIONED BY (l_shipdate) + |LOCATION '$basePath/lineitem_delta_parquet_insertoverwrite2' + |""".stripMargin) + + spark.sql(s""" + | insert into table lineitem_delta_parquet_insertoverwrite2 + | select * from lineitem + | where l_shipdate BETWEEN date'1993-01-01' AND date'1993-03-31' + |""".stripMargin) + + spark.sql( + s""" + | insert overwrite table lineitem_delta_parquet_insertoverwrite2 + | select * from lineitem where l_shipdate BETWEEN date'1993-02-01' AND date'1993-02-10' + |""".stripMargin) + val sql2 = + s""" + | select count(*) from lineitem_delta_parquet_insertoverwrite2 + | + |""".stripMargin + assert( + // total rows should remain unchanged + spark.sql(sql2).collect().apply(0).get(0) == 2418 + ) + } + + test("test parquet insert overwrite 
partitioned table with small table, dynamic with delta") { + withSQLConf(("spark.sql.sources.partitionOverwriteMode", "dynamic")) { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_delta_parquet_insertoverwrite3 PURGE; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_delta_parquet_insertoverwrite3 + |( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode string, + | l_comment string + |) + |USING delta + |PARTITIONED BY (l_shipdate) + |LOCATION '$basePath/lineitem_delta_parquet_insertoverwrite3' + |""".stripMargin) + + spark.sql(s""" + | insert into table lineitem_delta_parquet_insertoverwrite3 + | select * from lineitem + | where l_shipdate BETWEEN date'1993-01-01' AND date'1993-03-31' + |""".stripMargin) + + spark.sql( + s""" + | insert overwrite table lineitem_delta_parquet_insertoverwrite3 + | select * from lineitem where l_shipdate BETWEEN date'1993-02-01' AND date'1993-02-10' + |""".stripMargin) + val sql2 = + s""" + | select count(*) from lineitem_delta_parquet_insertoverwrite3 + | + |""".stripMargin + assert( + // total rows should remain unchanged + spark.sql(sql2).collect().apply(0).get(0) == 21875 + ) + } + } + + test("test parquet table update with the delta") { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_delta_parquet_update; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_delta_parquet_update + |( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode string, + | l_comment string + |) + |USING delta + |LOCATION '$basePath/lineitem_delta_parquet_update' + |""".stripMargin) + + spark.sql(s""" + | insert into table lineitem_delta_parquet_update + | select /*+ REPARTITION(6) */ * from lineitem + |""".stripMargin) + + spark.sql( + s""" + | update lineitem_delta_parquet_update set l_returnflag = 'Z' where l_orderkey = 12647 + |""".stripMargin) + + { + val sql1 = + s""" + | select count(*) from lineitem_delta_parquet_update where l_returnflag = 'Z' + | + |""".stripMargin + + val df = spark.sql(sql1) + val result = df.collect() + assert( + // in test data, there are only 1 row with l_orderkey = 12647 + result.apply(0).get(0) == 1 + ) + val scanExec = collect(df.queryExecution.executedPlan) { + case f: FileSourceScanExecTransformer => f + } + assert(scanExec.size == 1) + + val parquetScan = scanExec.head + assert(parquetScan.nodeName.startsWith("Scan parquet")) + + val fileIndex = parquetScan.relation.location.asInstanceOf[TahoeFileIndex] + val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddFile]) + assert(addFiles.size == 4) + } + + val sql2 = + s""" + | select count(*) from lineitem_delta_parquet_update + | + |""".stripMargin + assert( + // total rows should remain unchanged + spark.sql(sql2).collect().apply(0).get(0) == 600572 + ) + } + + test("test parquet table delete with the delta") { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_delta_parquet_delete; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT 
EXISTS lineitem_delta_parquet_delete + |( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode string, + | l_comment string + |) + |USING delta + |LOCATION '$basePath/lineitem_delta_parquet_delete' + |""".stripMargin) + + spark.sql(s""" + | insert into table lineitem_delta_parquet_delete + | select /*+ REPARTITION(6) */ * from lineitem + |""".stripMargin) + val df1 = spark.sql(s""" + | delete from lineitem_delta_parquet_delete where l_orderkey = 12647 + |""".stripMargin) + + { + val df = spark.sql(s""" + | select sum(l_linenumber) from lineitem_delta_parquet_delete + |""".stripMargin) + val result = df.collect() + assert( + result.apply(0).get(0) == 1802445 + ) + val scanExec = collect(df.queryExecution.executedPlan) { + case f: FileSourceScanExecTransformer => f + } + val parquetScan = scanExec.head + val fileIndex = parquetScan.relation.location.asInstanceOf[TahoeFileIndex] + val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddFile]) + assert(addFiles.size == 4) + } + + { + spark.sql(s""" + | delete from lineitem_delta_parquet_delete where mod(l_orderkey, 3) = 2 + |""".stripMargin) + val df3 = spark.sql(s""" + | select sum(l_linenumber) from lineitem_delta_parquet_delete + |""".stripMargin) + assert( + df3.collect().apply(0).get(0) == 1200671 + ) + } + } + + test("test parquet table upsert with the delta") { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_delta_parquet_upsert; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_delta_parquet_upsert + |( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode string, + | l_comment string + |) + |USING delta + |LOCATION '$basePath/lineitem_delta_parquet_upsert' + |""".stripMargin) + + spark.sql(s""" + | insert into table lineitem_delta_parquet_upsert + | select * from lineitem + |""".stripMargin) + + { + val df0 = spark.sql(s""" + | select sum(l_linenumber) from lineitem_delta_parquet_upsert + |""".stripMargin) + assert( + df0.collect().apply(0).get(0) == 1802446 + ) + } + + upsertSourceTableAndCheck("lineitem_delta_parquet_upsert") + } + + private def upsertSourceTableAndCheck(tableName: String) = { + // Why selecting l_orderkey having count(*) =1 ? + // Answer: to avoid "org.apache.spark.sql.delta.DeltaUnsupportedOperationException: + // Cannot perform Merge as multiple source rows matched and attempted to modify the same + // target row in the Delta table in possibly conflicting ways." 
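// Illustration of the constraint described in the comment above (hypothetical data, not part of the
// test): Delta's MERGE aborts when two source rows match the same target row. A source such as
//   spark.sql("select 1L as l_orderkey, 'Z' as l_returnflag union all select 1L, 'Y'")
// merged on l_orderkey into a target that already contains l_orderkey = 1 would raise the
// DeltaUnsupportedOperationException quoted above, because both source rows attempt to update that
// single target row; restricting the source to keys with count(*) = 1 keeps at most one source row
// per target key.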
+ spark.sql(s""" + merge into $tableName + using ( + + select l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, + 'Z' as `l_returnflag`, + l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + from lineitem where l_orderkey in (select l_orderkey from lineitem group by l_orderkey having count(*) =1 ) and l_orderkey < 100000 + + union + + select l_orderkey + 10000000, + l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, + l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + from lineitem where l_orderkey in (select l_orderkey from lineitem group by l_orderkey having count(*) =1 ) and l_orderkey < 100000 + + ) as updates + on updates.l_orderkey = $tableName.l_orderkey + when matched then update set * + when not matched then insert * + """.stripMargin) + + { + val df1 = spark.sql(s""" + | select count(*) from $tableName + |""".stripMargin) + assert( + df1.collect().apply(0).get(0) == 600572 + 3506 + ) + } + { + val df2 = + spark.sql(s""" + | select count(*) from $tableName where l_returnflag = 'Z' + |""".stripMargin) + assert( + df2.collect().apply(0).get(0) == 3506 + ) + } + + { + val df3 = + spark.sql(s""" + | select count(*) from $tableName where l_orderkey > 10000000 + |""".stripMargin) + assert( + df3.collect().apply(0).get(0) == 3506 + ) + } + } + + test("test parquet write with partition + delta") { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_delta_parquet_partition; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_delta_parquet_partition + |( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode string, + | l_comment string + |) + |USING delta + |PARTITIONED BY (l_shipdate, l_returnflag) + |LOCATION '$basePath/lineitem_delta_parquet_partition' + |""".stripMargin) + + // dynamic partitions + spark.sql(s""" + | insert into table lineitem_delta_parquet_partition + | select * from lineitem + | where l_shipdate BETWEEN date'1993-01-01' AND date'1993-03-31' + |""".stripMargin) + + // write with dataframe api + val source = spark.sql(s""" + |select + | l_orderkey , + | l_partkey , + | l_suppkey , + | l_linenumber , + | l_quantity , + | l_extendedprice , + | l_discount , + | l_tax , + | l_returnflag , + | l_linestatus , + | l_shipdate , + | l_commitdate , + | l_receiptdate , + | l_shipinstruct , + | l_shipmode , + | l_comment + | from lineitem + | where l_shipdate BETWEEN date'1993-01-01' AND date'1993-01-10' + |""".stripMargin) + + source.write + .format("delta") + .mode(SaveMode.Append) + .insertInto("lineitem_delta_parquet_partition") + + // static partition + spark.sql( + s""" + | insert into lineitem_delta_parquet_partition PARTITION (l_shipdate=date'1993-02-21', + | l_returnflag = 'A') + | (l_orderkey, + | l_partkey, + | l_suppkey, + | l_linenumber, + | l_quantity, + | l_extendedprice, + | l_discount, + | l_tax, + | l_linestatus, + | l_commitdate, + | l_receiptdate, + | l_shipinstruct, + | l_shipmode, + | l_comment) + | select l_orderkey, + | l_partkey, + | l_suppkey, + | l_linenumber, + | l_quantity, + | l_extendedprice, + | l_discount, + | l_tax, + | l_linestatus, + | l_commitdate, + | 
l_receiptdate, + | l_shipinstruct, + | l_shipmode, + | l_comment from lineitem + | where l_shipdate BETWEEN date'1993-02-01' AND date'1993-02-10' + |""".stripMargin) + val sqlStr = + s""" + |SELECT + | l_returnflag, + | l_linestatus, + | sum(l_quantity) AS sum_qty, + | sum(l_extendedprice) AS sum_base_price, + | sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price, + | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge, + | avg(l_quantity) AS avg_qty, + | avg(l_extendedprice) AS avg_price, + | avg(l_discount) AS avg_disc, + | count(*) AS count_order + |FROM + | lineitem_delta_parquet_partition + |WHERE + | l_shipdate <= date'1998-09-02' - interval 1 day + |GROUP BY + | l_returnflag, + | l_linestatus + |ORDER BY + | l_returnflag, + | l_linestatus; + | + |""".stripMargin + runTPCHQueryBySQL(1, sqlStr, compareResult = false) { + df => + val result = df.collect() + assert(result.size == 2) + assert(result(0).getString(0).equals("A")) + assert(result(0).getString(1).equals("F")) + assert(result(0).getDouble(2) == 368009.0) + + assert(result(1).getString(0).equals("R")) + assert(result(1).getString(1).equals("F")) + assert(result(1).getDouble(2) == 312371.0) + + val scanExec = collect(df.queryExecution.executedPlan) { + case f: FileSourceScanExecTransformer => f + } + assert(scanExec.size == 1) + + val parquetScan = scanExec(0) + assert(parquetScan.nodeName.startsWith("Scan parquet")) + assert(parquetScan.metrics("numFiles").value == 201) + + val fileIndex = parquetScan.relation.location.asInstanceOf[TahoeFileIndex] + val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddFile]) + + assert(addFiles.size == 201) + assert( + addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1993-03-31")).size == 2) + assert( + addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1993-01-01")).size == 4) + assert( + addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1993-02-21")).size == 3) + } + } + + test("test parquet CTAS simple with the delta") { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_delta_parquet_ctas1; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE lineitem_delta_parquet_ctas1 + |USING delta + |LOCATION '$basePath/lineitem_delta_parquet_ctas1' + | as select * from lineitem + |""".stripMargin) + + val sqlStr = + s""" + |SELECT + | l_returnflag, + | l_linestatus, + | sum(l_quantity) AS sum_qty, + | sum(l_extendedprice) AS sum_base_price, + | sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price, + | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge, + | avg(l_quantity) AS avg_qty, + | avg(l_extendedprice) AS avg_price, + | avg(l_discount) AS avg_disc, + | count(*) AS count_order + |FROM + | lineitem_delta_parquet_ctas1 + |WHERE + | l_shipdate <= date'1998-09-02' - interval 1 day + |GROUP BY + | l_returnflag, + | l_linestatus + |ORDER BY + | l_returnflag, + | l_linestatus; + | + |""".stripMargin + runTPCHQueryBySQL(1, sqlStr) { + df => + val scanExec = collect(df.queryExecution.executedPlan) { + case f: FileSourceScanExecTransformer => f + } + assert(scanExec.size == 1) + + val parquetScan = scanExec(0) + assert(parquetScan.nodeName.startsWith("Scan parquet")) + + val fileIndex = parquetScan.relation.location.asInstanceOf[TahoeFileIndex] + val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddFile]) + assert(addFiles.size == 1) + } + } + + test("test parquet CTAS complex with the delta") { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_delta_parquet_ctas2; + 
|""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_delta_parquet_ctas2 + |USING delta + |PARTITIONED BY (l_returnflag) + |LOCATION '$basePath/lineitem_mergetree_ctas2' + | as select * from lineitem + |""".stripMargin) + + val sqlStr = + s""" + |SELECT + | l_returnflag, + | l_linestatus, + | sum(l_quantity) AS sum_qty, + | sum(l_extendedprice) AS sum_base_price, + | sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price, + | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge, + | avg(l_quantity) AS avg_qty, + | avg(l_extendedprice) AS avg_price, + | avg(l_discount) AS avg_disc, + | count(*) AS count_order + |FROM + | lineitem_delta_parquet_ctas2 + |WHERE + | l_shipdate <= date'1998-09-02' - interval 1 day + |GROUP BY + | l_returnflag, + | l_linestatus + |ORDER BY + | l_returnflag, + | l_linestatus; + | + |""".stripMargin + runTPCHQueryBySQL(1, sqlStr) { _ => {} } + + } + + test("test path based parquet write with the delta") { + val dataPath = s"$basePath/lineitem_delta_parquet_filebased" + clearDataPath(dataPath) + + val sourceDF = spark.sql(s""" + |select * from lineitem + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE delta.`$dataPath` ( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode string, + | l_comment string + |) + |USING delta + |""".stripMargin) + + sourceDF.write + .format("delta") + .mode(SaveMode.Append) + .save(dataPath) + sourceDF.write + .format("delta") + .mode(SaveMode.Append) + .save(dataPath) + sourceDF.write + .format("delta") + .mode(SaveMode.Overwrite) + .save(dataPath) + + val sqlStr = + s""" + |SELECT + | l_returnflag, + | l_linestatus, + | sum(l_quantity) AS sum_qty, + | sum(l_extendedprice) AS sum_base_price, + | sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price, + | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge, + | avg(l_quantity) AS avg_qty, + | avg(l_extendedprice) AS avg_price, + | avg(l_discount) AS avg_disc, + | count(*) AS count_order + |FROM + | delta.`$dataPath` + |WHERE + | l_shipdate <= date'1998-09-02' - interval 1 day + |GROUP BY + | l_returnflag, + | l_linestatus + |ORDER BY + | l_returnflag, + | l_linestatus; + | + |""".stripMargin + runTPCHQueryBySQL(1, sqlStr) { + df => + val plans = collect(df.queryExecution.executedPlan) { + case f: FileSourceScanExecTransformer => f + case w: WholeStageTransformer => w + } + assert(plans.size == 4) + + val parquetScan = plans(3).asInstanceOf[FileSourceScanExecTransformer] + assert(parquetScan.nodeName.startsWith("Scan parquet")) + + val fileIndex = parquetScan.relation.location.asInstanceOf[TahoeFileIndex] + val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddFile]) + assert(addFiles.size == 1) + } + + val result = spark.read + .format("delta") + .load(dataPath) + .where("l_shipdate = date'1998-09-02'") + .count() + assert(result == 183) + } + + test( + "test path based parquet insert overwrite partitioned table with small table, static with delta") { + val dataPath = s"$basePath/lineitem_delta_parquet_insertoverwrite2" + clearDataPath(dataPath) + + val sourceDF = spark.sql(s""" + |select * from lineitem + |where l_shipdate BETWEEN date'1993-01-01' AND date'1993-03-31' + |""".stripMargin) + + sourceDF.write + 
.format("delta") + .mode(SaveMode.Append) + .partitionBy("l_shipdate") + .save(dataPath) + + val sourceDF1 = spark.sql( + s""" + |select * from lineitem where l_shipdate BETWEEN date'1993-02-01' AND date'1993-02-10' + |""".stripMargin) + sourceDF1.write + .format("delta") + .mode(SaveMode.Overwrite) + .partitionBy("l_shipdate") + .save(dataPath) + + val result = spark.read + .format("delta") + .load(dataPath) + .count() + assert(result == 2418) + } + + test( + "test path based parquet insert overwrite partitioned table with small table, dynamic with delta") { + withSQLConf(("spark.sql.sources.partitionOverwriteMode", "dynamic")) { + val dataPath = s"$basePath/lineitem_delta_parquet_insertoverwrite3" + clearDataPath(dataPath) + + val sourceDF = spark.sql(s""" + |select * from lineitem + |where l_shipdate BETWEEN date'1993-01-01' AND date'1993-03-31' + |""".stripMargin) + + sourceDF.write + .format("delta") + .mode(SaveMode.Append) + .partitionBy("l_shipdate") + .save(dataPath) + + val sourceDF1 = spark.sql( + s""" + |select * from lineitem where l_shipdate BETWEEN date'1993-02-01' AND date'1993-02-10' + |""".stripMargin) + sourceDF1.write + .format("delta") + .mode(SaveMode.Overwrite) + .partitionBy("l_shipdate") + .save(dataPath) + + val result = spark.read + .format("delta") + .load(dataPath) + .count() + assert(result == 21875) + } + } + + test("test path based parquet update with the delta") { + val dataPath = s"$basePath/lineitem_delta_parquet_update" + clearDataPath(dataPath) + + val sourceDF = spark.sql(s""" + |select /*+ REPARTITION(6) */ * from lineitem + |""".stripMargin) + + sourceDF.write + .format("delta") + .mode(SaveMode.Append) + .save(dataPath) + + spark.sql(s""" + | update delta.`$dataPath` set l_returnflag = 'Z' where l_orderkey = 12647 + |""".stripMargin) + + { + val df = spark.read + .format("delta") + .load(dataPath) + .where("l_returnflag = 'Z'") + assert(df.count() == 1) + val scanExec = collect(df.queryExecution.executedPlan) { + case f: FileSourceScanExecTransformer => f + } + assert(scanExec.size == 1) + + val parquetScan = scanExec.head + assert(parquetScan.nodeName.startsWith("Scan parquet")) + + val fileIndex = parquetScan.relation.location.asInstanceOf[TahoeFileIndex] + val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddFile]) + assert(addFiles.size == 4) + } + + val clickhouseTable = DeltaTable.forPath(spark, dataPath) + clickhouseTable.updateExpr("l_orderkey = 10086", Map("l_returnflag" -> "'X'")) + + { + val df = spark.read + .format("delta") + .load(dataPath) + .where("l_returnflag = 'X'") + assert(df.count() == 1) + val scanExec = collect(df.queryExecution.executedPlan) { + case f: FileSourceScanExecTransformer => f + } + assert(scanExec.size == 1) + + val parquetScan = scanExec.head + assert(parquetScan.nodeName.startsWith("Scan parquet")) + + val fileIndex = parquetScan.relation.location.asInstanceOf[TahoeFileIndex] + val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddFile]) + assert(addFiles.size == 3) + } + + val df = spark.read + .format("delta") + .load(dataPath) + assert(df.count() == 600572) + } + + test("test path based parquet delete with the delta") { + val dataPath = s"$basePath/lineitem_delta_parquet_delete" + clearDataPath(dataPath) + + val sourceDF = spark.sql(s""" + |select /*+ REPARTITION(6) */ * from lineitem + |""".stripMargin) + + sourceDF.write + .format("delta") + .mode(SaveMode.Append) + .save(dataPath) + + spark.sql(s""" + | delete from delta.`$dataPath` where l_orderkey = 12647 + 
|""".stripMargin) + val df = spark.read + .format("delta") + .load(dataPath) + assert(df.count() == 600571) + val scanExec = collect(df.queryExecution.executedPlan) { + case f: FileSourceScanExecTransformer => f + } + val parquetScan = scanExec.head + val fileIndex = parquetScan.relation.location.asInstanceOf[TahoeFileIndex] + val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddFile]) + assert(addFiles.size == 4) + + val clickhouseTable = DeltaTable.forPath(spark, dataPath) + clickhouseTable.delete("mod(l_orderkey, 3) = 2") + val df1 = spark.read + .format("delta") + .load(dataPath) + assert(df1.count() == 400089) + } + + test("test path based parquet upsert with the delta") { + val dataPath = s"$basePath/lineitem_delta_parquet_upsert" + clearDataPath(dataPath) + + val sourceDF = spark.sql(s""" + |select /*+ REPARTITION(6) */ * from lineitem + |""".stripMargin) + + sourceDF.write + .format("delta") + .mode(SaveMode.Append) + .save(dataPath) + + val df0 = spark.sql(s""" + | select count(*) from delta.`$dataPath` + |""".stripMargin) + assert( + df0.collect().apply(0).get(0) == 600572 + ) + upsertPathBasedSourceTableAndCheck(dataPath) + } + + private def upsertPathBasedSourceTableAndCheck(dataPath: String) = { + // Why selecting l_orderkey having count(*) =1 ? + // Answer: to avoid "org.apache.spark.sql.delta.DeltaUnsupportedOperationException: + // Cannot perform Merge as multiple source rows matched and attempted to modify the same + // target row in the Delta table in possibly conflicting ways." + spark.sql(s""" + merge into delta.`$dataPath` + using ( + + select l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, + 'Z' as `l_returnflag`, + l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + from lineitem where l_orderkey in (select l_orderkey from lineitem group by l_orderkey having count(*) =1 ) and l_orderkey < 100000 + + union + + select l_orderkey + 10000000, + l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, + l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + from lineitem where l_orderkey in (select l_orderkey from lineitem group by l_orderkey having count(*) =1 ) and l_orderkey < 100000 + + ) as updates + on updates.l_orderkey = delta.`$dataPath`.l_orderkey + when matched then update set * + when not matched then insert * + """.stripMargin) + + { + val df1 = spark.sql(s""" + | select count(*) from delta.`$dataPath` + |""".stripMargin) + assert( + df1.collect().apply(0).get(0) == 600572 + 3506 + ) + } + { + val df2 = + spark.sql(s""" + | select count(*) from delta.`$dataPath` where l_returnflag = 'Z' + |""".stripMargin) + assert( + df2.collect().apply(0).get(0) == 3506 + ) + } + + { + val df3 = + spark.sql(s""" + | select count(*) from delta.`$dataPath` where l_orderkey > 10000000 + |""".stripMargin) + assert( + df3.collect().apply(0).get(0) == 3506 + ) + } + } + + test("test path based parquet write with partition + delta") { + val dataPath = s"$basePath/lineitem_delta_parquet_partition" + clearDataPath(dataPath) + + val sourceDF = spark.sql(s""" + |select * from lineitem + |where l_shipdate BETWEEN date'1993-01-01' AND date'1993-03-31' + |""".stripMargin) + + sourceDF.write + .format("delta") + .partitionBy("l_shipdate", "l_returnflag") + .mode(SaveMode.Append) + .save(dataPath) + + val sourceDF1 = spark.sql( + s""" + |select * from lineitem where l_shipdate BETWEEN 
date'1993-01-01' AND date'1993-01-10' + |""".stripMargin) + + sourceDF1.write + .format("delta") + .mode(SaveMode.Append) + .save(dataPath) + + val sqlStr = + s""" + |SELECT + | l_returnflag, + | l_linestatus, + | sum(l_quantity) AS sum_qty, + | sum(l_extendedprice) AS sum_base_price, + | sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price, + | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge, + | avg(l_quantity) AS avg_qty, + | avg(l_extendedprice) AS avg_price, + | avg(l_discount) AS avg_disc, + | count(*) AS count_order + |FROM + | delta.`$dataPath` + |WHERE + | l_shipdate <= date'1998-09-02' - interval 1 day + |GROUP BY + | l_returnflag, + | l_linestatus + |ORDER BY + | l_returnflag, + | l_linestatus; + | + |""".stripMargin + runTPCHQueryBySQL(1, sqlStr, compareResult = false) { + df => + val result = df.collect() + assert(result.size == 2) + assert(result(0).getString(0).equals("A")) + assert(result(0).getString(1).equals("F")) + assert(result(0).getDouble(2) == 306633.0) + + assert(result(1).getString(0).equals("R")) + assert(result(1).getString(1).equals("F")) + assert(result(1).getDouble(2) == 312371.0) + + val scanExec = collect(df.queryExecution.executedPlan) { + case f: FileSourceScanExecTransformer => f + } + assert(scanExec.size == 1) + + val parquetScan = scanExec(0) + assert(parquetScan.nodeName.startsWith("Scan parquet")) + assert(parquetScan.metrics("numFiles").value == 200) + + val fileIndex = parquetScan.relation.location.asInstanceOf[TahoeFileIndex] + val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddFile]) + + assert(addFiles.size == 200) + assert( + addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1993-03-31")).size == 2) + assert( + addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1993-01-01")).size == 4) + } + } + + test("test path based parquet CTAS with delta") { + val dataPath = s"$basePath/lineitem_delta_parquet_ctas" + clearDataPath(dataPath) + + spark.sql(s""" + |CREATE TABLE delta.`$dataPath` + |USING delta + |PARTITIONED BY (l_linestatus) + | as select * from lineitem + |""".stripMargin) + + val sqlStr = + s""" + |SELECT + | l_returnflag, + | l_linestatus, + | sum(l_quantity) AS sum_qty, + | sum(l_extendedprice) AS sum_base_price, + | sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price, + | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge, + | avg(l_quantity) AS avg_qty, + | avg(l_extendedprice) AS avg_price, + | avg(l_discount) AS avg_disc, + | count(*) AS count_order + |FROM + | delta.`$dataPath` + |WHERE + | l_shipdate <= date'1998-09-02' - interval 1 day + |GROUP BY + | l_returnflag, + | l_linestatus + |ORDER BY + | l_returnflag, + | l_linestatus; + | + |""".stripMargin + runTPCHQueryBySQL(1, sqlStr) { _ => {} } + } + + test("test parquet optimize basic") { + withSQLConf("spark.databricks.delta.optimize.maxFileSize" -> "20000000") { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_delta_parquet_optimize; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_delta_parquet_optimize + |USING delta + |LOCATION '$basePath/lineitem_delta_parquet_optimize' + | as select /*+ REPARTITION(20) */ * from lineitem + |""".stripMargin) + + spark.sql("optimize lineitem_delta_parquet_optimize") + val ret = spark.sql("select count(*) from lineitem_delta_parquet_optimize").collect() + assert(ret.apply(0).get(0) == 600572) + + assert( + countFiles(new File(s"$basePath/lineitem_delta_parquet_optimize")) == 24 + ) + } + } + + def 
countFiles(directory: File): Int = { + if (directory.exists && directory.isDirectory && !directory.getName.equals("_commits")) { + val files = directory.listFiles + val count = files + .filter(!_.getName.endsWith(".crc")) + .count(_.isFile) + files.filter(_.isDirectory).map(countFiles).sum + count + } else { + 0 + } + } + + test("test parquet optimize partitioned by one low card column") { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_delta_parquet_optimize_p2; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_delta_parquet_optimize_p2 + |USING delta + |PARTITIONED BY (l_returnflag) + |LOCATION '$basePath/lineitem_delta_parquet_optimize_p2' + | as select /*+ REPARTITION(6) */ * from lineitem + |""".stripMargin) + + spark.sparkContext.setJobGroup("test3", "test3") + spark.sql("optimize lineitem_delta_parquet_optimize_p2") + val job_ids = spark.sparkContext.statusTracker.getJobIdsForGroup("test3") + if (sparkVersion.equals("3.2")) { + assert(job_ids.size == 7) // WILL trigger actual merge job + } else { + assert(job_ids.size == 8) // WILL trigger actual merge job + } + + spark.sparkContext.clearJobGroup() + + val ret = spark.sql("select count(*) from lineitem_delta_parquet_optimize_p2").collect() + assert(ret.apply(0).get(0) == 600572) + + assert(countFiles(new File(s"$basePath/lineitem_delta_parquet_optimize_p2")) == 23) + spark.sql("VACUUM lineitem_delta_parquet_optimize_p2 RETAIN 0 HOURS") + if (sparkVersion.equals("3.2")) { + assert(countFiles(new File(s"$basePath/lineitem_delta_parquet_optimize_p2")) == 5) + } else { + assert(countFiles(new File(s"$basePath/lineitem_delta_parquet_optimize_p2")) == 7) + } + + val ret2 = spark.sql("select count(*) from lineitem_delta_parquet_optimize_p2").collect() + assert(ret2.apply(0).get(0) == 600572) + } + + test("test parquet optimize parallel delete") { + withSQLConf("spark.databricks.delta.vacuum.parallelDelete.enabled" -> "true") { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_delta_parquet_optimize_p4; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_delta_parquet_optimize_p4 + |USING delta + |PARTITIONED BY (l_linenumber,l_returnflag) + |LOCATION '$basePath/lineitem_delta_parquet_optimize_p4' + | as select /*+ REPARTITION(6) */ * from lineitem + |""".stripMargin) + + spark.sql("optimize lineitem_delta_parquet_optimize_p4") + val ret = spark.sql("select count(*) from lineitem_delta_parquet_optimize_p4").collect() + assert(ret.apply(0).get(0) == 600572) + + assert(countFiles(new File(s"$basePath/lineitem_delta_parquet_optimize_p4")) == 149) + spark.sql("VACUUM lineitem_delta_parquet_optimize_p4 RETAIN 0 HOURS") + if (sparkVersion.equals("3.2")) { + assert(countFiles(new File(s"$basePath/lineitem_delta_parquet_optimize_p4")) == 23) + } else { + assert(countFiles(new File(s"$basePath/lineitem_delta_parquet_optimize_p4")) == 25) + } + + val ret2 = spark.sql("select count(*) from lineitem_delta_parquet_optimize_p4").collect() + assert(ret2.apply(0).get(0) == 600572) + } + } + + test("test parquet optimize with the path based table") { + val dataPath = s"$basePath/lineitem_delta_parquet_optimize_path_based" + clearDataPath(dataPath) + withSQLConf( + ("spark.databricks.delta.optimize.maxFileSize" -> "1000000"), + ("spark.databricks.delta.optimize.minFileSize" -> "838000")) { + + val sourceDF = spark.sql(s""" + |select /*+ REPARTITION(50) */ * from lineitem + |""".stripMargin) + + sourceDF.write + .format("delta") + .mode(SaveMode.Append) + .save(dataPath) + + val clickhouseTable = 
DeltaTable.forPath(spark, dataPath) + clickhouseTable.optimize().executeCompaction() + + clickhouseTable.vacuum(0.0) + if (sparkVersion.equals("3.2")) { + assert(countFiles(new File(dataPath)) == 27) + } else { + assert(countFiles(new File(dataPath)) == 29) + } + + val ret = spark.sql(s"select count(*) from clickhouse.`$dataPath`").collect() + assert(ret.apply(0).get(0) == 600572) + } + + withSQLConf( + ("spark.databricks.delta.optimize.maxFileSize" -> "10000000"), + ("spark.databricks.delta.optimize.minFileSize" -> "1000000")) { + + val clickhouseTable = DeltaTable.forPath(spark, dataPath) + clickhouseTable.optimize().executeCompaction() + + clickhouseTable.vacuum(0.0) + if (sparkVersion.equals("3.2")) { + assert(countFiles(new File(dataPath)) == 6) + } else { + assert(countFiles(new File(dataPath)) == 12) + } + + val ret = spark.sql(s"select count(*) from clickhouse.`$dataPath`").collect() + assert(ret.apply(0).get(0) == 600572) + } + + // now merge all parts (testing merging from merged parts) + val clickhouseTable = DeltaTable.forPath(spark, dataPath) + clickhouseTable.optimize().executeCompaction() + + clickhouseTable.vacuum(0.0) + if (sparkVersion.equals("3.2")) { + assert(countFiles(new File(dataPath)) == 5) + } else { + assert(countFiles(new File(dataPath)) == 13) + } + + val ret = spark.sql(s"select count(*) from clickhouse.`$dataPath`").collect() + assert(ret.apply(0).get(0) == 600572) + } +} +// scalastyle:off line.size.limit diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala index 3c993b6220186..4e190c087920b 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala @@ -111,6 +111,7 @@ class GlutenClickHouseHiveTableSuite getClass.getResource("/").getPath + "tests-working-home/spark-warehouse") .set("spark.hive.exec.dynamic.partition.mode", "nonstrict") .set("spark.gluten.supported.hive.udfs", "my_add") + .set("spark.gluten.sql.columnar.backend.ch.runtime_config.use_local_format", "true") .setMaster("local[*]") } @@ -1251,4 +1252,29 @@ class GlutenClickHouseHiveTableSuite } spark.sql("drop table test_tbl_3452") } + + test("GLUTEN-6235: Fix crash on ExpandTransform::work()") { + val tbl = "test_tbl_6235" + sql(s"drop table if exists $tbl") + val createSql = + s""" + |create table $tbl + |stored as textfile + |as select 1 as a1, 2 as a2, 3 as a3, 4 as a4, 5 as a5, 6 as a6, 7 as a7, 8 as a8, 9 as a9 + |""".stripMargin + sql(createSql) + val select_sql = + s""" + |select + |a5,a6,a7,a8,a3,a4,a9 + |,count(distinct a2) as a2 + |,count(distinct a1) as a1 + |,count(distinct if(a3=1,a2,null)) as a33 + |,count(distinct if(a4=2,a1,null)) as a43 + |from $tbl + |group by a5,a6,a7,a8,a3,a4,a9 with cube + |""".stripMargin + compareResultsAgainstVanillaSpark(select_sql, true, { _ => }) + sql(s"drop table if exists $tbl") + } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeOptimizeSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeOptimizeSuite.scala index d4302193f5b8c..7989c02ba872a 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeOptimizeSuite.scala +++ 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeOptimizeSuite.scala @@ -16,8 +16,6 @@ */ package org.apache.gluten.execution -import org.apache.gluten.GlutenConfig - import org.apache.spark.SparkConf import org.apache.spark.sql.SaveMode import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper @@ -61,6 +59,9 @@ class GlutenClickHouseMergeTreeOptimizeSuite .set( "spark.gluten.sql.columnar.backend.ch.runtime_settings.mergetree.merge_after_insert", "false") + .set( + "spark.gluten.sql.columnar.backend.ch.runtime_settings.input_format_parquet_max_block_size", + "8192") } override protected def createTPCHNotNullTables(): Unit = { @@ -82,16 +83,16 @@ class GlutenClickHouseMergeTreeOptimizeSuite spark.sql("optimize lineitem_mergetree_optimize") val ret = spark.sql("select count(*) from lineitem_mergetree_optimize").collect() - assert(ret.apply(0).get(0) == 600572) + assertResult(600572)(ret.apply(0).get(0)) - assert( - countFiles(new File(s"$basePath/lineitem_mergetree_optimize")) == 462 + assertResult(462)( + countFiles(new File(s"$basePath/lineitem_mergetree_optimize")) ) // many merged parts } } def countFiles(directory: File): Int = { - if (directory.exists && directory.isDirectory) { + if (directory.exists && directory.isDirectory && !directory.getName.equals("_commits")) { val files = directory.listFiles val count = files .filter(!_.getName.endsWith(".crc")) @@ -118,20 +119,27 @@ class GlutenClickHouseMergeTreeOptimizeSuite spark.sparkContext.setJobGroup("test", "test") spark.sql("optimize lineitem_mergetree_optimize_p") val job_ids = spark.sparkContext.statusTracker.getJobIdsForGroup("test") - assert(job_ids.size == 1) // will not trigger actual merge job + if (sparkVersion.equals("3.5")) { + assertResult(4)(job_ids.length) + } else { + assertResult(1)(job_ids.length) // will not trigger actual merge job + } spark.sparkContext.clearJobGroup() val ret = spark.sql("select count(*) from lineitem_mergetree_optimize_p").collect() - assert(ret.apply(0).get(0) == 600572) + assertResult(600572)(ret.apply(0).get(0)) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=false") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p")) == 22728) + assertResult(22728)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p"))) spark.sql("VACUUM lineitem_mergetree_optimize_p RETAIN 0 HOURS") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p")) == 22728) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=true") + if (sparkVersion.equals("3.2")) { + assertResult(22728)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p"))) + } else { + // For Spark 3.3 + Delta 2.3, vacuum command will create two commit files in deltalog dir. 
+ assertResult(22730)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p"))) + } val ret2 = spark.sql("select count(*) from lineitem_mergetree_optimize_p").collect() - assert(ret2.apply(0).get(0) == 600572) + assertResult(600572)(ret2.apply(0).get(0)) } test("test mergetree optimize partitioned by one low card column") { @@ -150,23 +158,34 @@ class GlutenClickHouseMergeTreeOptimizeSuite spark.sparkContext.setJobGroup("test2", "test2") spark.sql("optimize lineitem_mergetree_optimize_p2") val job_ids = spark.sparkContext.statusTracker.getJobIdsForGroup("test2") - assert(job_ids.size == 7) // WILL trigger actual merge job + if (sparkVersion.equals("3.2")) { + assertResult(7)(job_ids.length) // WILL trigger actual merge job + } else { + assertResult(8)(job_ids.length) // WILL trigger actual merge job + } + spark.sparkContext.clearJobGroup() val ret = spark.sql("select count(*) from lineitem_mergetree_optimize_p2").collect() - assert(ret.apply(0).get(0) == 600572) + assertResult(600572)(ret.apply(0).get(0)) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=false") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p2")) == 812) + assertResult(372)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p2"))) spark.sql("VACUUM lineitem_mergetree_optimize_p2 RETAIN 0 HOURS") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p2")) == 232) + if (sparkVersion.equals("3.2")) { + assertResult(239)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p2"))) + } else { + assertResult(241)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p2"))) + } spark.sql("VACUUM lineitem_mergetree_optimize_p2 RETAIN 0 HOURS") // the second VACUUM will remove some empty folders - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p2")) == 220) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=true") + if (sparkVersion.equals("3.2")) { + assertResult(220)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p2"))) + } else { + assertResult(226)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p2"))) + } val ret2 = spark.sql("select count(*) from lineitem_mergetree_optimize_p2").collect() - assert(ret2.apply(0).get(0) == 600572) + assertResult(600572)(ret2.apply(0).get(0)) } test("test mergetree optimize partitioned by two low card column") { @@ -185,18 +204,24 @@ class GlutenClickHouseMergeTreeOptimizeSuite spark.sql("optimize lineitem_mergetree_optimize_p3") val ret = spark.sql("select count(*) from lineitem_mergetree_optimize_p3").collect() - assert(ret.apply(0).get(0) == 600572) + assertResult(600572)(ret.apply(0).get(0)) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=false") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p3")) == 398) + assertResult(516)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p3"))) spark.sql("VACUUM lineitem_mergetree_optimize_p3 RETAIN 0 HOURS") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p3")) == 286) + if (sparkVersion.equals("3.2")) { + assertResult(306)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p3"))) + } else { + assertResult(308)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p3"))) + } spark.sql("VACUUM lineitem_mergetree_optimize_p3 RETAIN 0 HOURS") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p3")) == 270) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=true") + if (sparkVersion.equals("3.2")) { + assertResult(276)(countFiles(new 
File(s"$basePath/lineitem_mergetree_optimize_p3"))) + } else { + assertResult(282)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p3"))) + } val ret2 = spark.sql("select count(*) from lineitem_mergetree_optimize_p3").collect() - assert(ret2.apply(0).get(0) == 600572) + assertResult(600572)(ret2.apply(0).get(0)) } } @@ -216,18 +241,24 @@ class GlutenClickHouseMergeTreeOptimizeSuite spark.sql("optimize lineitem_mergetree_optimize_p4") val ret = spark.sql("select count(*) from lineitem_mergetree_optimize_p4").collect() - assert(ret.apply(0).get(0) == 600572) + assertResult(600572)(ret.apply(0).get(0)) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=false") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p4")) == 398) + assertResult(516)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p4"))) spark.sql("VACUUM lineitem_mergetree_optimize_p4 RETAIN 0 HOURS") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p4")) == 286) + if (sparkVersion.equals("3.2")) { + assertResult(306)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p4"))) + } else { + assertResult(308)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p4"))) + } spark.sql("VACUUM lineitem_mergetree_optimize_p4 RETAIN 0 HOURS") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p4")) == 270) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=true") + if (sparkVersion.equals("3.2")) { + assertResult(276)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p4"))) + } else { + assertResult(282)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p4"))) + } val ret2 = spark.sql("select count(*) from lineitem_mergetree_optimize_p4").collect() - assert(ret2.apply(0).get(0) == 600572) + assertResult(600572)(ret2.apply(0).get(0)) } } @@ -248,19 +279,23 @@ class GlutenClickHouseMergeTreeOptimizeSuite spark.sql("optimize lineitem_mergetree_optimize_p5") - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=false") spark.sql("VACUUM lineitem_mergetree_optimize_p5 RETAIN 0 HOURS") spark.sql("VACUUM lineitem_mergetree_optimize_p5 RETAIN 0 HOURS") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p5")) == 99) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=true") + if (sparkVersion.equals("3.2")) { + assertResult(99)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p5"))) + } else { + // For Spark 3.3 + Delta 2.3, vacuum command will create two commit files in deltalog dir. 
+ // this case will create a checkpoint + assertResult(105)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p5"))) + } val ret = spark.sql("select count(*) from lineitem_mergetree_optimize_p5").collect() - assert(ret.apply(0).get(0) == 600572) + assertResult(600572)(ret.apply(0).get(0)) } withSQLConf( - ("spark.databricks.delta.optimize.maxFileSize" -> "10000000"), - ("spark.databricks.delta.optimize.minFileSize" -> "838250")) { + "spark.databricks.delta.optimize.maxFileSize" -> "10000000", + "spark.databricks.delta.optimize.minFileSize" -> "838250") { // of the remaing 3 original parts, 2 are less than 838250, 1 is larger (size 838255) // the merged part is ~27MB, so after optimize there should be 3 parts: // 1 merged part from 2 original parts, 1 merged part from 34 original parts @@ -268,27 +303,33 @@ class GlutenClickHouseMergeTreeOptimizeSuite spark.sql("optimize lineitem_mergetree_optimize_p5") - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=false") spark.sql("VACUUM lineitem_mergetree_optimize_p5 RETAIN 0 HOURS") spark.sql("VACUUM lineitem_mergetree_optimize_p5 RETAIN 0 HOURS") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p5")) == 93) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=true") + if (sparkVersion.equals("3.2")) { + assertResult(93)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p5"))) + } else { + // For Spark 3.3 + Delta 2.3, vacuum command will create two commit files in deltalog dir. + assertResult(104)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p5"))) + } val ret = spark.sql("select count(*) from lineitem_mergetree_optimize_p5").collect() - assert(ret.apply(0).get(0) == 600572) + assertResult(600572)(ret.apply(0).get(0)) } // now merge all parts (testing merging from merged parts) spark.sql("optimize lineitem_mergetree_optimize_p5") - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=false") spark.sql("VACUUM lineitem_mergetree_optimize_p5 RETAIN 0 HOURS") spark.sql("VACUUM lineitem_mergetree_optimize_p5 RETAIN 0 HOURS") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p5")) == 77) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=true") + if (sparkVersion.equals("3.2")) { + assertResult(77)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p5"))) + } else { + // For Spark 3.3 + Delta 2.3, vacuum command will create two commit files in deltalog dir. 
+ assertResult(93)(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p5"))) + } val ret = spark.sql("select count(*) from lineitem_mergetree_optimize_p5").collect() - assert(ret.apply(0).get(0) == 600572) + assertResult(600572)(ret.apply(0).get(0)) } test("test mergetree optimize table with partition and bucket") { @@ -309,21 +350,17 @@ class GlutenClickHouseMergeTreeOptimizeSuite spark.sql("optimize lineitem_mergetree_optimize_p6") val ret = spark.sql("select count(*) from lineitem_mergetree_optimize_p6").collect() - assert(ret.apply(0).get(0) == 600572) + assertResult(600572)(ret.apply(0).get(0)) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=false") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p6")) == { - if (sparkVersion.equals("3.2")) 931 else 1014 - }) + assertResult(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p6")))( + if (sparkVersion.equals("3.2")) 499 else 528) spark.sql("VACUUM lineitem_mergetree_optimize_p6 RETAIN 0 HOURS") spark.sql("VACUUM lineitem_mergetree_optimize_p6 RETAIN 0 HOURS") - assert(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p6")) == { - if (sparkVersion.equals("3.2")) 439 else 445 - }) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=true") + assertResult(countFiles(new File(s"$basePath/lineitem_mergetree_optimize_p6")))( + if (sparkVersion.equals("3.2")) 315 else 327) val ret2 = spark.sql("select count(*) from lineitem_mergetree_optimize_p6").collect() - assert(ret2.apply(0).get(0) == 600572) + assertResult(600572)(ret2.apply(0).get(0)) } test("test skip index after optimize") { @@ -343,9 +380,7 @@ class GlutenClickHouseMergeTreeOptimizeSuite |""".stripMargin) spark.sql("optimize lineitem_mergetree_index") - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=false") spark.sql("vacuum lineitem_mergetree_index") - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=true") val df = spark .sql(s""" @@ -355,12 +390,12 @@ class GlutenClickHouseMergeTreeOptimizeSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) - val mergetreeScan = scanExec(0) + assertResult(1)(scanExec.size) + val mergetreeScan = scanExec.head val ret = df.collect() - assert(ret.apply(0).get(0) == 2) + assertResult(2)(ret.apply(0).get(0)) val marks = mergetreeScan.metrics("selectedMarks").value - assert(marks == 1) + assertResult(1)(marks) val directory = new File(s"$basePath/lineitem_mergetree_index") val partDir = directory.listFiles().filter(f => f.getName.endsWith("merged")).head @@ -389,19 +424,21 @@ class GlutenClickHouseMergeTreeOptimizeSuite val clickhouseTable = ClickhouseTable.forPath(spark, dataPath) clickhouseTable.optimize().executeCompaction() - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=false") clickhouseTable.vacuum(0.0) clickhouseTable.vacuum(0.0) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=true") - assert(countFiles(new File(dataPath)) == 99) + if (sparkVersion.equals("3.2")) { + assertResult(99)(countFiles(new File(dataPath))) + } else { + assertResult(105)(countFiles(new File(dataPath))) + } val ret = spark.sql(s"select count(*) from clickhouse.`$dataPath`").collect() - assert(ret.apply(0).get(0) == 600572) + assertResult(600572)(ret.apply(0).get(0)) } withSQLConf( - ("spark.databricks.delta.optimize.maxFileSize" -> "10000000"), - ("spark.databricks.delta.optimize.minFileSize" -> "838250")) { + "spark.databricks.delta.optimize.maxFileSize" -> "10000000", + "spark.databricks.delta.optimize.minFileSize" 
-> "838250") { // of the remaing 3 original parts, 2 are less than 838250, 1 is larger (size 838255) // the merged part is ~27MB, so after optimize there should be 3 parts: // 1 merged part from 2 original parts, 1 merged part from 34 original parts @@ -410,34 +447,38 @@ class GlutenClickHouseMergeTreeOptimizeSuite val clickhouseTable = ClickhouseTable.forPath(spark, dataPath) clickhouseTable.optimize().executeCompaction() - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=false") clickhouseTable.vacuum(0.0) clickhouseTable.vacuum(0.0) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=true") - assert(countFiles(new File(dataPath)) == 93) + if (sparkVersion.equals("3.2")) { + assertResult(93)(countFiles(new File(dataPath))) + } else { + assertResult(104)(countFiles(new File(dataPath))) + } val ret = spark.sql(s"select count(*) from clickhouse.`$dataPath`").collect() - assert(ret.apply(0).get(0) == 600572) + assertResult(600572)(ret.apply(0).get(0)) } // now merge all parts (testing merging from merged parts) val clickhouseTable = ClickhouseTable.forPath(spark, dataPath) clickhouseTable.optimize().executeCompaction() - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=false") clickhouseTable.vacuum(0.0) clickhouseTable.vacuum(0.0) - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=true") - assert(countFiles(new File(dataPath)) == 77) + if (sparkVersion.equals("3.2")) { + assertResult(77)(countFiles(new File(dataPath))) + } else { + assertResult(93)(countFiles(new File(dataPath))) + } val ret = spark.sql(s"select count(*) from clickhouse.`$dataPath`").collect() - assert(ret.apply(0).get(0) == 600572) + assertResult(600572)(ret.apply(0).get(0)) } test("test mergetree insert with optimize basic") { withSQLConf( - ("spark.databricks.delta.optimize.minFileSize" -> "200000000"), - ("spark.gluten.sql.columnar.backend.ch.runtime_settings.mergetree.merge_after_insert" -> "true") + "spark.databricks.delta.optimize.minFileSize" -> "200000000", + "spark.gluten.sql.columnar.backend.ch.runtime_settings.mergetree.merge_after_insert" -> "true" ) { spark.sql(s""" |DROP TABLE IF EXISTS lineitem_mergetree_insert_optimize_basic; @@ -451,10 +492,10 @@ class GlutenClickHouseMergeTreeOptimizeSuite |""".stripMargin) val ret = spark.sql("select count(*) from lineitem_mergetree_insert_optimize_basic").collect() - assert(ret.apply(0).get(0) == 600572) + assertResult(600572)(ret.apply(0).get(0)) eventually(timeout(60.seconds), interval(3.seconds)) { - assert( - new File(s"$basePath/lineitem_mergetree_insert_optimize_basic").listFiles().length == 2 + assertResult(2)( + new File(s"$basePath/lineitem_mergetree_insert_optimize_basic").listFiles().length ) } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreePathBasedWriteSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreePathBasedWriteSuite.scala index 93f22baa25756..79d663debcdec 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreePathBasedWriteSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreePathBasedWriteSuite.scala @@ -60,6 +60,9 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite .set( "spark.gluten.sql.columnar.backend.ch.runtime_settings.mergetree.merge_after_insert", "false") + .set( + "spark.gluten.sql.columnar.backend.ch.runtime_settings.input_format_parquet_max_block_size", + "8192") } override protected def createTPCHNotNullTables(): Unit = { @@ 
-143,7 +146,7 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite case f: FileSourceScanExecTransformer => f case w: WholeStageTransformer => w } - assert(plans.size == 4) + assertResult(4)(plans.size) val mergetreeScan = plans(3).asInstanceOf[FileSourceScanExecTransformer] assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) @@ -154,10 +157,8 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.nonEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert( - addFiles.map(_.rows).sum - == 600572) + assertResult(6)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) // GLUTEN-5060: check the unnecessary FilterExec val wholeStageTransformer = plans(2).asInstanceOf[WholeStageTransformer] @@ -174,7 +175,7 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite .load(dataPath) .where("l_shipdate = date'1998-09-02'") .count() - assert(result == 183) + assertResult(183)(result) } test("test mergetree path based write with dataframe api") { @@ -236,40 +237,35 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite case f: FileSourceScanExecTransformer => f case w: WholeStageTransformer => w } - assert(plans.size == 4) + assertResult(4)(plans.size) val mergetreeScan = plans(3).asInstanceOf[FileSourceScanExecTransformer] assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) - assert( + assertResult("l_shipdate,l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_shipdate,l_orderkey")) - assert( + .mkString(",")) + assertResult("l_shipdate")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .primaryKeyOption .get - .mkString(",") - .equals("l_shipdate")) - assert( + .mkString(",")) + assertResult("l_returnflag,l_linestatus")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .lowCardKeyOption .get - .mkString(",") - .equals("l_returnflag,l_linestatus")) + .mkString(",")) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert( - addFiles.map(_.rows).sum - == 600572) + assertResult(6)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) // GLUTEN-5060: check the unnecessary FilterExec val wholeStageTransformer = plans(2).asInstanceOf[WholeStageTransformer] @@ -286,7 +282,7 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite .load(dataPath) .where("l_shipdate = date'1998-09-02'") .collect() - assert(result.apply(0).get(0) == 110501) + assertResult(110501)(result.apply(0).get(0)) } test("test mergetree path based insert overwrite partitioned table with small table, static") { @@ -320,7 +316,7 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite .format("clickhouse") .load(dataPath) .count() - assert(result == 2418) + assertResult(2418)(result) } test("test mergetree path based insert overwrite partitioned table with small table, dynamic") { @@ -355,7 +351,7 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite .format("clickhouse") .load(dataPath) .count() - assert(result == 600572) + assertResult(600572)(result) } } @@ -381,11 +377,11 @@ class 
GlutenClickHouseMergeTreePathBasedWriteSuite .format("clickhouse") .load(dataPath) .where("l_returnflag = 'Z'") - assert(df.count() == 1) + assertResult(1)(df.count()) val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) @@ -397,16 +393,13 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.isEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert( - addFiles.map(_.rows).sum - == 600572) - + assertResult(600572)(addFiles.map(_.rows).sum) // 4 parts belong to the first batch // 2 parts belong to the second batch (1 actual updated part, 1 passively updated). - assert(addFiles.size == 6) + assertResult(6)(addFiles.size) val filePaths = addFiles.map(_.path).groupBy(name => name.substring(0, name.lastIndexOf("_"))) - assert(filePaths.size == 2) - assert(Array(2, 4).sameElements(filePaths.values.map(paths => paths.size).toArray.sorted)) + assertResult(2)(filePaths.size) + assertResult(Array(2, 4))(filePaths.values.map(paths => paths.size).toArray.sorted) } val clickhouseTable = ClickhouseTable.forPath(spark, dataPath) @@ -417,33 +410,30 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite .format("clickhouse") .load(dataPath) .where("l_returnflag = 'X'") - assert(df.count() == 1) + assertResult(1)(df.count()) val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert( - addFiles.map(_.rows).sum - == 600572) + assertResult(600572)(addFiles.map(_.rows).sum) // 4 parts belong to the first batch // 2 parts belong to the second batch (1 actual updated part, 1 passively updated). - assert(addFiles.size == 6) + assertResult(6)(addFiles.size) val filePaths = addFiles.map(_.path).groupBy(name => name.substring(0, name.lastIndexOf("_"))) - assert(filePaths.size == 2) - assert(Array(2, 4).sameElements(filePaths.values.map(paths => paths.size).toArray.sorted)) + assertResult(2)(filePaths.size) } val df = spark.read .format("clickhouse") .load(dataPath) - assert(df.count() == 600572) + assertResult(600572)(df.count()) } test("test mergetree path based table delete") { @@ -465,7 +455,7 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite val df = spark.read .format("clickhouse") .load(dataPath) - assert(df.count() == 600571) + assertResult(600571)(df.count()) val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } @@ -474,17 +464,17 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) // 4 parts belong to the first batch // 2 parts belong to the second batch (1 actual updated part, 1 passively updated). 
- assert(addFiles.size == 6) + assertResult(6)(addFiles.size) val filePaths = addFiles.map(_.path).groupBy(name => name.substring(0, name.lastIndexOf("_"))) - assert(filePaths.size == 2) - assert(Array(2, 4).sameElements(filePaths.values.map(paths => paths.size).toArray.sorted)) + assertResult(2)(filePaths.size) + assertResult(Array(2, 4))(filePaths.values.map(paths => paths.size).toArray.sorted) val clickhouseTable = ClickhouseTable.forPath(spark, dataPath) clickhouseTable.delete("mod(l_orderkey, 3) = 2") val df1 = spark.read .format("clickhouse") .load(dataPath) - assert(df1.count() == 400089) + assertResult(400089)(df1.count()) } test("test mergetree path based table upsert") { @@ -503,8 +493,8 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite val df0 = spark.sql(s""" | select count(*) from clickhouse.`$dataPath` |""".stripMargin) - assert( - df0.collect().apply(0).get(0) == 600572 + assertResult(600572)( + df0.collect().apply(0).get(0) ) upsertSourceTableAndCheck(dataPath) } @@ -540,8 +530,8 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite val df1 = spark.sql(s""" | select count(*) from clickhouse.`$dataPath` |""".stripMargin) - assert( - df1.collect().apply(0).get(0) == 600572 + 3506 + assertResult(600572 + 3506)( + df1.collect().apply(0).get(0) ) } { @@ -549,8 +539,8 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite spark.sql(s""" | select count(*) from clickhouse.`$dataPath` where l_returnflag = 'Z' |""".stripMargin) - assert( - df2.collect().apply(0).get(0) == 3506 + assertResult(3506)( + df2.collect().apply(0).get(0) ) } @@ -559,8 +549,8 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite spark.sql(s""" | select count(*) from clickhouse.`$dataPath` where l_orderkey > 10000000 |""".stripMargin) - assert( - df3.collect().apply(0).get(0) == 3506 + assertResult(3506)( + df3.collect().apply(0).get(0) ) } } @@ -610,33 +600,31 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert( + assertResult("l_shipdate,l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_shipdate,l_orderkey")) - assert( + .mkString(",")) + assertResult("l_shipdate")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .primaryKeyOption .get - .mkString(",") - .equals("l_shipdate")) + .mkString(",")) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert(addFiles.map(_.rows).sum == 600572) + assertResult(6)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) } val df = spark.read @@ -650,7 +638,7 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite |""".stripMargin) .agg(sum("l_linenumber").alias("res")) val result = df.collect() - assert(result(0).getLong(0) == 34842) + assertResult(34842)(result(0).getLong(0)) } test("test mergetree path based write with partition") { @@ -707,62 +695,56 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite 
runTPCHQueryBySQL(1, sqlStr, compareResult = false) { df => val result = df.collect() - assert(result.size == 4) - assert(result(0).getString(0).equals("A")) - assert(result(0).getString(1).equals("F")) - assert(result(0).getDouble(2) == 3803858.0) + assertResult(4)(result.length) + assertResult("A")(result(0).getString(0)) + assertResult("F")(result(0).getString(1)) + assertResult(3803858.0)(result(0).getDouble(2)) - assert(result(2).getString(0).equals("N")) - assert(result(2).getString(1).equals("O")) - assert(result(2).getDouble(2) == 7454519.0) + assertResult("N")(result(2).getString(0)) + assertResult("O")(result(2).getString(1)) + assertResult(7454519.0)(result(2).getDouble(2)) val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) - assert(mergetreeScan.metrics("numFiles").value == 3744) + assertResult(3744)(mergetreeScan.metrics("numFiles").value) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert( + assertResult("l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_orderkey")) - assert( + .mkString(",")) + assertResult("l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .primaryKeyOption .get - .mkString(",") - .equals("l_orderkey")) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size == 2) - assert( + .mkString(",")) + assertResult(2)(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size) + assertResult("l_shipdate")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) - .partitionColumns(0) - .equals("l_shipdate")) - assert( + .partitionColumns + .head) + assertResult("l_returnflag")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) - .partitionColumns(1) - .equals("l_returnflag")) + .partitionColumns(1)) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 3835) - assert(addFiles.map(_.rows).sum == 602945) - assert( - addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1992-06-01")).size == 2) - assert( - addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1993-01-01")).size == 4) - assert( - addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1995-01-21")).size == 2) + assertResult(3835)(addFiles.size) + assertResult(602945)(addFiles.map(_.rows).sum) + assertResult(2)(addFiles.count(_.partitionValues("l_shipdate").equals("1992-06-01"))) + assertResult(4)(addFiles.count(_.partitionValues("l_shipdate").equals("1993-01-01"))) + assertResult(2)(addFiles.count(_.partitionValues("l_shipdate").equals("1995-01-21"))) } } @@ -777,10 +759,10 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite sourceDF.write .format("clickhouse") .partitionBy("l_shipdate") - .option("clickhouse.orderByKey", "l_partkey,l_returnflag") - .option("clickhouse.primaryKey", "l_partkey") + .option("clickhouse.orderByKey", "l_orderkey,l_returnflag") + .option("clickhouse.primaryKey", "l_orderkey") .option("clickhouse.numBuckets", "4") - .option("clickhouse.bucketColumnNames", "l_orderkey") + .option("clickhouse.bucketColumnNames", "l_partkey") .mode(SaveMode.Append) 
.save(dataPath) @@ -814,61 +796,49 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) val buckets = ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption - assert(!buckets.isEmpty) - assert(buckets.get.numBuckets == 4) - assert( + assert(buckets.isDefined) + assertResult(4)(buckets.get.numBuckets) + assertResult("l_orderkey,l_returnflag")( buckets.get.sortColumnNames - .mkString(",") - .equals("l_partkey,l_returnflag")) - assert( + .mkString(",")) + assertResult("l_partkey")( buckets.get.bucketColumnNames - .mkString(",") - .equals("l_orderkey")) - assert( + .mkString(",")) + assertResult("l_orderkey,l_returnflag")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_partkey,l_returnflag")) - assert( + .mkString(",")) + assertResult("l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .primaryKeyOption .get - .mkString(",") - .equals("l_partkey")) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size == 1) - assert( + .mkString(",")) + assertResult(1)(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size) + assertResult("l_shipdate")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) - .partitionColumns(0) - .equals("l_shipdate")) + .partitionColumns + .head) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 10089) - assert(addFiles.map(_.rows).sum == 600572) - assert( - addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1992-06-01")).size == 4) - assert( - addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1993-01-01")).size == 4) - assert( - addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1995-01-21")).size == 4) - assert( - addFiles - .filter( - f => - f.partitionValues.get("l_shipdate").get.equals("1995-01-21") && f.bucketNum.equals( - "00000")) - .size == 1) + assertResult(10089)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) + assertResult(4)(addFiles.count(_.partitionValues("l_shipdate").equals("1992-06-01"))) + assertResult(4)(addFiles.count(_.partitionValues("l_shipdate").equals("1993-01-01"))) + assertResult(4)(addFiles.count(_.partitionValues("l_shipdate").equals("1995-01-21"))) + assertResult(1)(addFiles.count( + f => f.partitionValues("l_shipdate").equals("1995-01-21") && f.bucketNum.equals("00000"))) } // check part pruning effect of filter on bucket column val df = spark.sql(s""" @@ -883,7 +853,7 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite .flatMap(partition => partition.asInstanceOf[GlutenMergeTreePartition].partList) .map(_.name) .distinct - assert(touchedParts.size == 1) + assertResult(4)(touchedParts.size) // test upsert on partitioned & bucketed table upsertSourceTableAndCheck(dataPath) @@ -929,9 +899,9 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = 
scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] @@ -941,10 +911,8 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.isEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert( - addFiles.map(_.rows).sum - == 600572) + assertResult(6)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) } } @@ -1052,9 +1020,9 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite | |""".stripMargin - assert( + assertResult("R")( // total rows should remain unchanged - spark.sql(sqlStr2).collect().apply(0).get(0) == "R" + spark.sql(sqlStr2).collect().apply(0).get(0) ) // test select * @@ -1101,40 +1069,38 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert( + assertResult("l_shipdate,l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_shipdate,l_orderkey")) - assert( + .mkString(",")) + assertResult("l_shipdate")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .primaryKeyOption .get - .mkString(",") - .equals("l_shipdate")) + .mkString(",")) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert(addFiles.map(_.rows).sum == 600572) + assertResult(6)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) val plans = collect(df.queryExecution.executedPlan) { case scanExec: BasicScanExecTransformer => scanExec } - assert(plans.size == 1) - assert(plans(0).metrics("selectedMarksPk").value === 17) - assert(plans(0).metrics("totalMarksPk").value === 74) + assertResult(1)(plans.size) + assertResult(17)(plans.head.metrics("selectedMarksPk").value) + assertResult(74)(plans.head.metrics("totalMarksPk").value) } } @@ -1161,12 +1127,12 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) - val mergetreeScan = scanExec(0) + assertResult(1)(scanExec.size) + val mergetreeScan = scanExec.head - assert(ret.apply(0).get(0) == 1) + assertResult(1)(ret.apply(0).get(0)) val marks = mergetreeScan.metrics("selectedMarks").value - assert(marks == 1) + assertResult(1)(marks) val directory = new File(dataPath) // find a folder whose name is like 48b70783-b3b8-4bf8-9c52-5261aead8e3e_0_006 @@ -1197,11 +1163,11 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) - val mergetreeScan = scanExec(0) - assert(ret.apply(0).get(0) == 2) + assertResult(1)(scanExec.size) + val 
mergetreeScan = scanExec.head + assertResult(2)(ret.apply(0).get(0)) val marks = mergetreeScan.metrics("selectedMarks").value - assert(marks == 1) + assertResult(1)(marks) val directory = new File(dataPath) // find a folder whose name is like 48b70783-b3b8-4bf8-9c52-5261aead8e3e_0_006 @@ -1233,11 +1199,11 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) - val mergetreeScan = scanExec(0) - assert(ret.apply(0).get(0) == 2) + assertResult(1)(scanExec.size) + val mergetreeScan = scanExec.head + assertResult(2)(ret.apply(0).get(0)) val marks = mergetreeScan.metrics("selectedMarks").value - assert(marks == 1) + assertResult(1)(marks) val directory = new File(dataPath) // find a folder whose name is like 48b70783-b3b8-4bf8-9c52-5261aead8e3e_0_006 @@ -1277,18 +1243,16 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert( - addFiles.map(_.rows).sum - == 600572) + assertResult(6)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) } } @@ -1320,7 +1284,7 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite val fileFilter = new WildcardFileFilter("*_0_*") var dataFileList = dataPathFile.list(fileFilter) - assert(dataFileList.size == 6) + assertResult(6)(dataFileList.length) // re-create the same table val dataPath2 = s"$basePath/lineitem_mergetree_5219_s" @@ -1339,7 +1303,7 @@ class GlutenClickHouseMergeTreePathBasedWriteSuite assert(dataPathFile.isDirectory && dataPathFile.isDirectory) dataFileList = dataPathFile.list(fileFilter) - assert(dataFileList.size == 6) + assertResult(6)(dataFileList.length) } } // scalastyle:off line.size.limit diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnHDFSSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnHDFSSuite.scala index ca5b39fff1ace..99b2120599666 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnHDFSSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnHDFSSuite.scala @@ -25,10 +25,12 @@ import org.apache.spark.sql.execution.datasources.v2.clickhouse.metadata.AddMerg import org.apache.commons.io.FileUtils import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.FileSystem +import org.apache.hadoop.fs.{FileSystem, Path} import java.io.File +import scala.concurrent.duration.DurationInt + // Some sqls' line length exceeds 100 // scalastyle:off line.size.limit @@ -57,6 +59,7 @@ class GlutenClickHouseMergeTreeWriteOnHDFSSuite .set( "spark.gluten.sql.columnar.backend.ch.runtime_settings.mergetree.merge_after_insert", "false") + // .set("spark.gluten.sql.columnar.backend.ch.runtime_config.path", "/data") // for local test } override protected def beforeEach(): Unit = { @@ -74,7 +77,7 @@ class GlutenClickHouseMergeTreeWriteOnHDFSSuite FileUtils.deleteDirectory(new 
File(HDFS_METADATA_PATH)) } - ignore("test mergetree table write") { + test("test mergetree table write") { spark.sql(s""" |DROP TABLE IF EXISTS lineitem_mergetree_hdfs; |""".stripMargin) @@ -139,7 +142,7 @@ class GlutenClickHouseMergeTreeWriteOnHDFSSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) @@ -151,13 +154,13 @@ class GlutenClickHouseMergeTreeWriteOnHDFSSuite assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.isEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 1) - assert(addFiles.head.rows == 600572) + assertResult(1)(addFiles.size) + assertResult(600572)(addFiles.head.rows) } spark.sql("drop table lineitem_mergetree_hdfs") } - ignore("test mergetree write with orderby keys / primary keys") { + test("test mergetree write with orderby keys / primary keys") { spark.sql(s""" |DROP TABLE IF EXISTS lineitem_mergetree_orderbykey_hdfs; |""".stripMargin) @@ -224,7 +227,7 @@ class GlutenClickHouseMergeTreeWriteOnHDFSSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) @@ -232,29 +235,27 @@ class GlutenClickHouseMergeTreeWriteOnHDFSSuite val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert( + assertResult("l_shipdate,l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_shipdate,l_orderkey")) - assert( + .mkString(",")) + assertResult("l_shipdate")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .primaryKeyOption .get - .mkString(",") - .equals("l_shipdate")) + .mkString(",")) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 1) - assert(addFiles.head.rows == 600572) + assertResult(1)(addFiles.size) + assertResult(600572)(addFiles.head.rows) } spark.sql("drop table lineitem_mergetree_orderbykey_hdfs") } - ignore("test mergetree write with partition") { + test("test mergetree write with partition") { spark.sql(s""" |DROP TABLE IF EXISTS lineitem_mergetree_partition_hdfs; |""".stripMargin) @@ -386,56 +387,54 @@ class GlutenClickHouseMergeTreeWriteOnHDFSSuite runTPCHQueryBySQL(1, sqlStr, compareResult = false) { df => val result = df.collect() - assert(result.length == 4) - assert(result(0).getString(0).equals("A")) - assert(result(0).getString(1).equals("F")) - assert(result(0).getDouble(2) == 7578058.0) + assertResult(4)(result.length) + assertResult("A")(result(0).getString(0)) + assertResult("F")(result(0).getString(1)) + assertResult(7578058.0)(result(0).getDouble(2)) - assert(result(2).getString(0).equals("N")) - assert(result(2).getString(1).equals("O")) - assert(result(2).getDouble(2) == 7454519.0) + assertResult("N")(result(2).getString(0)) + assertResult("O")(result(2).getString(1)) + 
assertResult(7454519.0)(result(2).getDouble(2)) val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) - assert(mergetreeScan.metrics("numFiles").value == 6) + assertResult(6)(mergetreeScan.metrics("numFiles").value) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert( + assertResult("l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_orderkey")) - assert( + .mkString(",")) + assertResult("l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .primaryKeyOption .get - .mkString(",") - .equals("l_orderkey")) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size == 1) - assert( + .mkString(",")) + assertResult(1)(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size) + assertResult("l_returnflag")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) - .partitionColumns(0) - .equals("l_returnflag")) + .partitionColumns + .head) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert(addFiles.map(_.rows).sum == 750735) + assertResult(6)(addFiles.size) + assertResult(750735)(addFiles.map(_.rows).sum) } spark.sql("drop table lineitem_mergetree_partition_hdfs") } - ignore("test mergetree write with bucket table") { + test("test mergetree write with bucket table") { spark.sql(s""" |DROP TABLE IF EXISTS lineitem_mergetree_bucket_hdfs; |""".stripMargin) @@ -503,41 +502,40 @@ class GlutenClickHouseMergeTreeWriteOnHDFSSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) - assert(!ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) + assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isDefined) if (sparkVersion.equals("3.2")) { assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).orderByKeyOption.isEmpty) } else { - assert( + assertResult("l_partkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_partkey")) + .mkString(",")) } assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.isEmpty) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size == 1) - assert( + assertResult(1)(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size) + assertResult("l_returnflag")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) - .partitionColumns(0) - .equals("l_returnflag")) + .partitionColumns + .head) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 12) - assert(addFiles.map(_.rows).sum == 600572) + assertResult(12)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) } spark.sql("drop table lineitem_mergetree_bucket_hdfs") 
} - ignore("test mergetree write with the path based") { + test("test mergetree write with the path based") { val dataPath = s"$HDFS_URL/test/lineitem_mergetree_bucket_hdfs" val sourceDF = spark.sql(s""" @@ -585,39 +583,78 @@ class GlutenClickHouseMergeTreeWriteOnHDFSSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) - assert(!ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert( + assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isDefined) + assertResult("l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_orderkey")) + .mkString(",")) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.nonEmpty) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size == 1) - assert( + assertResult(1)(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size) + assertResult("l_returnflag")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) - .partitionColumns(0) - .equals("l_returnflag")) + .partitionColumns + .head) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 12) - assert(addFiles.map(_.rows).sum == 600572) + assertResult(12)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) } val result = spark.read .format("clickhouse") .load(dataPath) .count() - assert(result == 600572) + assertResult(600572)(result) + } + + test("test mergetree insert with optimize basic") { + val tableName = "lineitem_mergetree_insert_optimize_basic_hdfs" + val dataPath = s"$HDFS_URL/test/$tableName" + + withSQLConf( + "spark.databricks.delta.optimize.minFileSize" -> "200000000", + "spark.gluten.sql.columnar.backend.ch.runtime_settings.mergetree.merge_after_insert" -> "true", + "spark.gluten.sql.columnar.backend.ch.runtime_settings.mergetree.insert_without_local_storage" -> "true", + "spark.gluten.sql.columnar.backend.ch.runtime_settings.min_insert_block_size_rows" -> "10000" + ) { + spark.sql(s""" + |DROP TABLE IF EXISTS $tableName; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS $tableName + |USING clickhouse + |LOCATION '$dataPath' + |TBLPROPERTIES (storage_policy='__hdfs_main') + | as select * from lineitem + |""".stripMargin) + + val ret = spark.sql(s"select count(*) from $tableName").collect() + assertResult(600572)(ret.apply(0).get(0)) + val conf = new Configuration + conf.set("fs.defaultFS", HDFS_URL) + val fs = FileSystem.get(conf) + + eventually(timeout(60.seconds), interval(2.seconds)) { + val it = fs.listFiles(new Path(dataPath), true) + var files = 0 + while (it.hasNext) { + it.next() + files += 1 + } + assertResult(72)(files) + } + } } } // scalastyle:off line.size.limit diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnS3Suite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnS3Suite.scala index 2b641438802f6..30f443265caeb 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnS3Suite.scala 
+++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnS3Suite.scala @@ -55,6 +55,7 @@ class GlutenClickHouseMergeTreeWriteOnS3Suite .set("spark.sql.autoBroadcastJoinThreshold", "10MB") .set("spark.sql.adaptive.enabled", "true") .set("spark.gluten.sql.columnar.backend.ch.runtime_config.logger.level", "error") + // .set("spark.gluten.sql.columnar.backend.ch.runtime_config.path", "/data") // for local test } override protected def beforeEach(): Unit = { @@ -152,7 +153,7 @@ class GlutenClickHouseMergeTreeWriteOnS3Suite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) @@ -164,8 +165,8 @@ class GlutenClickHouseMergeTreeWriteOnS3Suite assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.isEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 1) - assert(addFiles.head.rows == 600572) + assertResult(1)(addFiles.size) + assertResult(600572)(addFiles.head.rows) } spark.sql("drop table lineitem_mergetree_s3") // clean up } @@ -237,7 +238,7 @@ class GlutenClickHouseMergeTreeWriteOnS3Suite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) @@ -245,24 +246,22 @@ class GlutenClickHouseMergeTreeWriteOnS3Suite val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert( + assertResult("l_shipdate,l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_shipdate,l_orderkey")) - assert( + .mkString(",")) + assertResult("l_shipdate")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .primaryKeyOption .get - .mkString(",") - .equals("l_shipdate")) + .mkString(",")) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 1) - assert(addFiles.head.rows == 600572) + assertResult(1)(addFiles.size) + assertResult(600572)(addFiles.head.rows) } spark.sql("drop table lineitem_mergetree_orderbykey_s3") } @@ -399,51 +398,49 @@ class GlutenClickHouseMergeTreeWriteOnS3Suite runTPCHQueryBySQL(1, sqlStr, compareResult = false) { df => val result = df.collect() - assert(result.length == 4) - assert(result(0).getString(0).equals("A")) - assert(result(0).getString(1).equals("F")) - assert(result(0).getDouble(2) == 7578058.0) + assertResult(4)(result.length) + assertResult("A")(result(0).getString(0)) + assertResult("F")(result(0).getString(1)) + assertResult(7578058.0)(result(0).getDouble(2)) - assert(result(2).getString(0).equals("N")) - assert(result(2).getString(1).equals("O")) - assert(result(2).getDouble(2) == 7454519.0) + assertResult("N")(result(2).getString(0)) + assertResult("O")(result(2).getString(1)) + assertResult(7454519.0)(result(2).getDouble(2)) val scanExec = collect(df.queryExecution.executedPlan) { case f: 
FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) - assert(mergetreeScan.metrics("numFiles").value == 6) + assertResult(6)(mergetreeScan.metrics("numFiles").value) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert( + assertResult("l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_orderkey")) - assert( + .mkString(",")) + assertResult("l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .primaryKeyOption .get - .mkString(",") - .equals("l_orderkey")) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size == 1) - assert( + .mkString(",")) + assertResult(1)(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size) + assertResult("l_returnflag")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) - .partitionColumns(0) - .equals("l_returnflag")) + .partitionColumns + .head) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert(addFiles.map(_.rows).sum == 750735) + assertResult(6)(addFiles.size) + assertResult(750735)(addFiles.map(_.rows).sum) } spark.sql("drop table lineitem_mergetree_partition_s3") @@ -517,36 +514,35 @@ class GlutenClickHouseMergeTreeWriteOnS3Suite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) - assert(!ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) + assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isDefined) if (sparkVersion.equals("3.2")) { assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).orderByKeyOption.isEmpty) } else { - assert( + assertResult("l_partkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_partkey")) + .mkString(",")) } assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.isEmpty) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size == 1) - assert( + assertResult(1)(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size) + assertResult("l_returnflag")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) - .partitionColumns(0) - .equals("l_returnflag")) + .partitionColumns + .head) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 12) - assert(addFiles.map(_.rows).sum == 600572) + assertResult(12)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) } spark.sql("drop table lineitem_mergetree_bucket_s3") } @@ -599,39 +595,137 @@ class GlutenClickHouseMergeTreeWriteOnS3Suite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head 
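// ---------------------------------------------------------------------------
// Editor's note, not part of the patch: the hunks above repeat one pattern --
// collect the scan node from df.queryExecution.executedPlan and compare it via
// assertResult(expected)(actual), which reports expected vs. actual on failure
// instead of a bare "assertion failed". Below is a minimal sketch of that
// pattern against stock Spark; it is an illustration under assumptions, not
// Gluten code: it uses FileSourceScanExec because FileSourceScanExecTransformer
// needs the Gluten plugin, disables AQE because the suites instead reach inside
// adaptive plans via AdaptiveSparkPlanHelper.collect, and assumes ScalaTest is
// on the classpath for assertResult.
// ---------------------------------------------------------------------------
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.FileSourceScanExec
import org.scalatest.Assertions._

object ScanNodeAssertSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("scan-assert-sketch")
      .config("spark.sql.adaptive.enabled", "false") // keep the scan visible to a plain tree collect
      .getOrCreate()
    import spark.implicits._

    // Write a tiny parquet table, read it back with a filter, and force execution.
    val dir = java.nio.file.Files.createTempDirectory("scan_assert").toString
    Seq((1L, "A"), (2L, "N")).toDF("l_orderkey", "l_returnflag")
      .write.mode("overwrite").parquet(dir)
    val df = spark.read.parquet(dir).filter($"l_orderkey" > 0)
    df.collect()

    // Same shape as the suites: pull every file scan out of the executed plan,
    // then assert on the count with assertResult for a readable failure message.
    val scans = df.queryExecution.executedPlan.collect { case s: FileSourceScanExec => s }
    assertResult(1)(scans.size)

    spark.stop()
  }
}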
assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) - assert(!ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert( + assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isDefined) + assertResult("l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_orderkey")) + .mkString(",")) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.nonEmpty) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size == 1) - assert( + assertResult(1)(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size) + assertResult("l_returnflag")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) - .partitionColumns(0) - .equals("l_returnflag")) + .partitionColumns + .head) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 12) - assert(addFiles.map(_.rows).sum == 600572) + assertResult(12)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) } val result = spark.read .format("clickhouse") .load(dataPath) .count() - assert(result == 600572) + assertResult(600572)(result) + } + + test("test mergetree insert with optimize basic") { + val tableName = "lineitem_mergetree_insert_optimize_basic_s3" + val dataPath = s"s3a://$BUCKET_NAME/$tableName" + + withSQLConf( + "spark.databricks.delta.optimize.minFileSize" -> "200000000", + "spark.gluten.sql.columnar.backend.ch.runtime_settings.mergetree.merge_after_insert" -> "true" + ) { + spark.sql(s""" + |DROP TABLE IF EXISTS $tableName; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS $tableName + |USING clickhouse + |LOCATION '$dataPath' + | as select * from lineitem + |""".stripMargin) + + val ret = spark.sql(s"select count(*) from $tableName").collect() + assertResult(600572)(ret.apply(0).get(0)) + assert( + !new File(s"$CH_DEFAULT_STORAGE_DIR/lineitem_mergetree_insert_optimize_basic").exists()) + } + } + + test("test mergetree with primary keys pruning by driver") { + val tableName = "lineitem_mergetree_pk_pruning_by_driver_s3" + val dataPath = s"s3a://$BUCKET_NAME/$tableName" + spark.sql(s""" + |DROP TABLE IF EXISTS $tableName; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS $tableName + |( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode string, + | l_comment string + |) + |USING clickhouse + |TBLPROPERTIES (storage_policy='__s3_main', orderByKey='l_shipdate') + |LOCATION '$dataPath' + |""".stripMargin) + + spark.sql(s""" + | insert into table $tableName + | select * from lineitem + |""".stripMargin) + + FileUtils.forceDelete(new File(S3_METADATA_PATH)) + + val sqlStr = + s""" + |SELECT + | sum(l_extendedprice * l_discount) AS revenue + |FROM + | $tableName + |WHERE + | l_shipdate >= date'1994-01-01' + | AND l_shipdate < date'1994-01-01' + interval 1 year + | AND l_discount BETWEEN 0.06 - 0.01 AND 0.06 + 0.01 + | AND l_quantity < 24 + |""".stripMargin + + withSQLConf( + 
"spark.gluten.sql.columnar.backend.ch.runtime_settings.enabled_driver_filter_mergetree_index" -> "true") { + runTPCHQueryBySQL(6, sqlStr) { + df => + val scanExec = collect(df.queryExecution.executedPlan) { + case f: FileSourceScanExecTransformer => f + } + assertResult(1)(scanExec.size) + + val mergetreeScan = scanExec.head + assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) + + val plans = collect(df.queryExecution.executedPlan) { + case scanExec: BasicScanExecTransformer => scanExec + } + assertResult(1)(plans.size) + assertResult(1)(plans.head.getSplitInfos.size) + } + } } } // scalastyle:off line.size.limit diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteSuite.scala index 439a1b58fd4f2..27bd4372aa643 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteSuite.scala @@ -57,6 +57,9 @@ class GlutenClickHouseMergeTreeWriteSuite .set( "spark.gluten.sql.columnar.backend.ch.runtime_settings.mergetree.merge_after_insert", "false") + .set( + "spark.gluten.sql.columnar.backend.ch.runtime_settings.input_format_parquet_max_block_size", + "8192") } override protected def createTPCHNotNullTables(): Unit = { @@ -128,7 +131,7 @@ class GlutenClickHouseMergeTreeWriteSuite case f: FileSourceScanExecTransformer => f case w: WholeStageTransformer => w } - assert(plans.size == 4) + assertResult(4)(plans.size) val mergetreeScan = plans(3).asInstanceOf[FileSourceScanExecTransformer] assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) @@ -140,10 +143,8 @@ class GlutenClickHouseMergeTreeWriteSuite assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.isEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert( - addFiles.map(_.rows).sum - == 600572) + assertResult(6)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) // GLUTEN-5060: check the unnecessary FilterExec val wholeStageTransformer = plans(2).asInstanceOf[WholeStageTransformer] @@ -200,9 +201,9 @@ class GlutenClickHouseMergeTreeWriteSuite | select count(*) from lineitem_mergetree_insertoverwrite | |""".stripMargin - assert( + assertResult(300001)( // total rows should remain unchanged - spark.sql(sql2).collect().apply(0).get(0) == 300001 + spark.sql(sql2).collect().apply(0).get(0) ) } @@ -251,9 +252,9 @@ class GlutenClickHouseMergeTreeWriteSuite | select count(*) from lineitem_mergetree_insertoverwrite2 | |""".stripMargin - assert( + assertResult(2418)( // total rows should remain unchanged - spark.sql(sql2).collect().apply(0).get(0) == 2418 + spark.sql(sql2).collect().apply(0).get(0) ) } @@ -303,9 +304,9 @@ class GlutenClickHouseMergeTreeWriteSuite | select count(*) from lineitem_mergetree_insertoverwrite3 | |""".stripMargin - assert( + assertResult(600572)( // total rows should remain unchanged - spark.sql(sql2).collect().apply(0).get(0) == 600572 + spark.sql(sql2).collect().apply(0).get(0) ) } } @@ -357,14 +358,14 @@ class GlutenClickHouseMergeTreeWriteSuite val df = spark.sql(sql1) val result = df.collect() - assert( + assertResult(1)( // in test data, there are only 1 row with l_orderkey = 12647 - result.apply(0).get(0) 
== 1 + result.apply(0).get(0) ) val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) @@ -376,16 +377,14 @@ class GlutenClickHouseMergeTreeWriteSuite assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.isEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert( - addFiles.map(_.rows).sum - == 600572) + assertResult(600572)(addFiles.map(_.rows).sum) // 4 parts belong to the first batch // 2 parts belong to the second batch (1 actual updated part, 1 passively updated). - assert(addFiles.size == 6) + assertResult(6)(addFiles.size) val filePaths = addFiles.map(_.path).groupBy(name => name.substring(0, name.lastIndexOf("_"))) - assert(filePaths.size == 2) - assert(Array(2, 4).sameElements(filePaths.values.map(paths => paths.size).toArray.sorted)) + assertResult(2)(filePaths.size) + assertResult(Array(2, 4))(filePaths.values.map(paths => paths.size).toArray.sorted) } val sql2 = @@ -393,9 +392,9 @@ class GlutenClickHouseMergeTreeWriteSuite | select count(*) from lineitem_mergetree_update | |""".stripMargin - assert( + assertResult(600572)( // total rows should remain unchanged - spark.sql(sql2).collect().apply(0).get(0) == 600572 + spark.sql(sql2).collect().apply(0).get(0) ) } @@ -435,17 +434,14 @@ class GlutenClickHouseMergeTreeWriteSuite val df1 = spark.sql(s""" | delete from lineitem_mergetree_delete where l_orderkey = 12647 |""".stripMargin) -// assert( -// df1.collect().apply(0).get(0) == 1 -// ) { val df = spark.sql(s""" | select count(*) from lineitem_mergetree_delete |""".stripMargin) val result = df.collect() - assert( - result.apply(0).get(0) == 600571 + assertResult(600571)( + result.apply(0).get(0) ) val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f @@ -455,10 +451,10 @@ class GlutenClickHouseMergeTreeWriteSuite val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) // 4 parts belong to the first batch // 2 parts belong to the second batch (1 actual updated part, 1 passively updated). 
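// ---------------------------------------------------------------------------
// Editor's note, not part of the patch: the filePaths checks below strip the
// trailing "_<suffix>" from each MergeTree part name to recover which insert
// batch produced it, then compare the sorted group sizes. assertResult compares
// arrays structurally, which is why the old Array(2, 4).sameElements(...)
// wrapper can be dropped. A dependency-free sketch of that grouping, using
// made-up part names (the real names come from AddMergeTreeParts.path):
// ---------------------------------------------------------------------------
object PartGroupingSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical part names: four parts written by the first insert,
    // two written by the later update.
    val partNames = Seq(
      "batch1_1", "batch1_2", "batch1_3", "batch1_4",
      "batch2_1", "batch2_2")

    // Group by everything before the last underscore, mirroring
    // addFiles.map(_.path).groupBy(name => name.substring(0, name.lastIndexOf("_"))).
    val byBatch = partNames.groupBy(name => name.substring(0, name.lastIndexOf("_")))

    assert(byBatch.size == 2)
    assert(byBatch.values.map(_.size).toSeq.sorted == Seq(2, 4))
  }
}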
- assert(addFiles.size == 6) + assertResult(6)(addFiles.size) val filePaths = addFiles.map(_.path).groupBy(name => name.substring(0, name.lastIndexOf("_"))) - assert(filePaths.size == 2) - assert(Array(2, 4).sameElements(filePaths.values.map(paths => paths.size).toArray.sorted)) + assertResult(2)(filePaths.size) + assertResult(Array(2, 4))(filePaths.values.map(paths => paths.size).toArray.sorted) } { @@ -468,9 +464,7 @@ class GlutenClickHouseMergeTreeWriteSuite val df3 = spark.sql(s""" | select count(*) from lineitem_mergetree_delete |""".stripMargin) - assert( - df3.collect().apply(0).get(0) == 400089 - ) + assertResult(400089)(df3.collect().apply(0).get(0)) } } @@ -512,9 +506,7 @@ class GlutenClickHouseMergeTreeWriteSuite val df0 = spark.sql(s""" | select count(*) from lineitem_mergetree_upsert |""".stripMargin) - assert( - df0.collect().apply(0).get(0) == 600572 - ) + assertResult(600572)(df0.collect().apply(0).get(0)) } upsertSourceTableAndCheck("lineitem_mergetree_upsert") @@ -551,18 +543,14 @@ class GlutenClickHouseMergeTreeWriteSuite val df1 = spark.sql(s""" | select count(*) from $tableName |""".stripMargin) - assert( - df1.collect().apply(0).get(0) == 600572 + 3506 - ) + assertResult(600572 + 3506)(df1.collect().apply(0).get(0)) } { val df2 = spark.sql(s""" | select count(*) from $tableName where l_returnflag = 'Z' |""".stripMargin) - assert( - df2.collect().apply(0).get(0) == 3506 - ) + assertResult(3506)(df2.collect().apply(0).get(0)) } { @@ -570,9 +558,7 @@ class GlutenClickHouseMergeTreeWriteSuite spark.sql(s""" | select count(*) from $tableName where l_orderkey > 10000000 |""".stripMargin) - assert( - df3.collect().apply(0).get(0) == 3506 - ) + assertResult(3506)(df3.collect().apply(0).get(0)) } } @@ -642,33 +628,31 @@ class GlutenClickHouseMergeTreeWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert( + assertResult("l_shipdate,l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_shipdate,l_orderkey")) - assert( + .mkString(",")) + assertResult("l_shipdate")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .primaryKeyOption .get - .mkString(",") - .equals("l_shipdate")) + .mkString(",")) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert(addFiles.map(_.rows).sum == 600572) + assertResult(6)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) } } @@ -800,62 +784,56 @@ class GlutenClickHouseMergeTreeWriteSuite runTPCHQueryBySQL(1, sqlStr, compareResult = false) { df => val result = df.collect() - assert(result.size == 4) - assert(result(0).getString(0).equals("A")) - assert(result(0).getString(1).equals("F")) - assert(result(0).getDouble(2) == 3865234.0) + assertResult(4)(result.length) + assertResult("A")(result(0).getString(0)) + assertResult("F")(result(0).getString(1)) + assertResult(3865234.0)(result(0).getDouble(2)) - 
assert(result(2).getString(0).equals("N")) - assert(result(2).getString(1).equals("O")) - assert(result(2).getDouble(2) == 7454519.0) + assertResult("N")(result(2).getString(0)) + assertResult("O")(result(2).getString(1)) + assertResult(7454519.0)(result(2).getDouble(2)) val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) - assert(mergetreeScan.metrics("numFiles").value == 3745) + assertResult(3745)(mergetreeScan.metrics("numFiles").value) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert( + assertResult("l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_orderkey")) - assert( + .mkString(",")) + assertResult("l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .primaryKeyOption .get - .mkString(",") - .equals("l_orderkey")) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size == 2) - assert( + .mkString(",")) + assertResult(2)(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size) + assertResult("l_shipdate")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) - .partitionColumns(0) - .equals("l_shipdate")) - assert( + .partitionColumns + .head) + assertResult("l_returnflag")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) - .partitionColumns(1) - .equals("l_returnflag")) + .partitionColumns(1)) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 3836) - assert(addFiles.map(_.rows).sum == 605363) - assert( - addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1992-06-01")).size == 2) - assert( - addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1993-01-01")).size == 4) - assert( - addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1995-01-21")).size == 3) + assertResult(3836)(addFiles.size) + assertResult(605363)(addFiles.map(_.rows).sum) + assertResult(2)(addFiles.count(_.partitionValues("l_shipdate").equals("1992-06-01"))) + assertResult(4)(addFiles.count(_.partitionValues("l_shipdate").equals("1993-01-01"))) + assertResult(3)(addFiles.count(_.partitionValues("l_shipdate").equals("1995-01-21"))) } } @@ -887,8 +865,8 @@ class GlutenClickHouseMergeTreeWriteSuite |) |USING clickhouse |PARTITIONED BY (l_shipdate) - |CLUSTERED BY (l_orderkey) - |${if (sparkVersion.equals("3.2")) "" else "SORTED BY (l_partkey, l_returnflag)"} INTO 4 BUCKETS + |CLUSTERED BY (l_partkey) + |${if (sparkVersion.equals("3.2")) "" else "SORTED BY (l_orderkey, l_returnflag)"} INTO 4 BUCKETS |LOCATION '$basePath/lineitem_mergetree_bucket' |""".stripMargin) @@ -927,49 +905,40 @@ class GlutenClickHouseMergeTreeWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) - 
assert(!ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) + assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isDefined) if (sparkVersion.equals("3.2")) { assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).orderByKeyOption.isEmpty) } else { - assert( + assertResult("l_orderkey,l_returnflag")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_partkey,l_returnflag")) + .mkString(",")) } assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.isEmpty) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size == 1) - assert( + assertResult(1)(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.size) + assertResult("l_shipdate")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) - .partitionColumns(0) - .equals("l_shipdate")) + .partitionColumns + .head) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 10089) - assert(addFiles.map(_.rows).sum == 600572) - assert( - addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1992-06-01")).size == 4) - assert( - addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1993-01-01")).size == 4) - assert( - addFiles.filter(_.partitionValues.get("l_shipdate").get.equals("1995-01-21")).size == 4) - assert( - addFiles - .filter( - f => - f.partitionValues.get("l_shipdate").get.equals("1995-01-21") && f.bucketNum.equals( - "00000")) - .size == 1) + assertResult(10089)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) + assertResult(4)(addFiles.count(_.partitionValues("l_shipdate").equals("1992-06-01"))) + assertResult(4)(addFiles.count(_.partitionValues("l_shipdate").equals("1993-01-01"))) + assertResult(4)(addFiles.count(_.partitionValues("l_shipdate").equals("1995-01-21"))) + assertResult(1)(addFiles.count( + f => f.partitionValues("l_shipdate").equals("1995-01-21") && f.bucketNum.equals("00000"))) } // check part pruning effect of filter on bucket column val df = spark.sql(s""" @@ -984,7 +953,7 @@ class GlutenClickHouseMergeTreeWriteSuite .flatMap(partition => partition.asInstanceOf[GlutenMergeTreePartition].partList) .map(_.name) .distinct - assert(touchedParts.size == 1) + assertResult(4)(touchedParts.size) // test upsert on partitioned & bucketed table upsertSourceTableAndCheck("lineitem_mergetree_bucket") @@ -996,9 +965,7 @@ class GlutenClickHouseMergeTreeWriteSuite val df0 = spark.sql(s""" | select count(*) from lineitem_mergetree_bucket |""".stripMargin) - assert( - df0.collect().apply(0).get(0) == 3 - ) + assertResult(3)(df0.collect().apply(0).get(0)) } @@ -1065,40 +1032,40 @@ class GlutenClickHouseMergeTreeWriteSuite warehouse + "/" + tableName } val deletedPath = new File(deletedPathStr) - assert(deletedPath.exists() == exceptedExists) + assertResult(exceptedExists)(deletedPath.exists()) } // test non external table var tableName = "lineitem_mergetree_drop" var tableLocation = "" createAndDropTable(tableName, tableLocation) - checkTableExists(tableName, tableLocation, false) + checkTableExists(tableName, tableLocation, exceptedExists = false) // test external table tableName = "lineitem_mergetree_external_drop" - createAndDropTable(tableName, tableLocation, true) - checkTableExists(tableName, tableLocation, false) + createAndDropTable(tableName, tableLocation, isExternal = true) + checkTableExists(tableName, tableLocation, exceptedExists = false) // test table with the specified location tableName = 
"lineitem_mergetree_location_drop" tableLocation = basePath + "/" + tableName createAndDropTable(tableName, tableLocation) - checkTableExists(tableName, tableLocation, true) + checkTableExists(tableName, tableLocation, exceptedExists = true) tableName = "lineitem_mergetree_external_location_drop" tableLocation = basePath + "/" + tableName - createAndDropTable(tableName, tableLocation, true) - checkTableExists(tableName, tableLocation, true) + createAndDropTable(tableName, tableLocation, isExternal = true) + checkTableExists(tableName, tableLocation, exceptedExists = true) tableName = "lineitem_mergetree_location_purge" tableLocation = basePath + "/" + tableName createAndDropTable(tableName, tableLocation, purgeTable = true) - checkTableExists(tableName, tableLocation, false) + checkTableExists(tableName, tableLocation, exceptedExists = false) tableName = "lineitem_mergetree_external_location_purge" tableLocation = basePath + "/" + tableName - createAndDropTable(tableName, tableLocation, true, true) - checkTableExists(tableName, tableLocation, false) + createAndDropTable(tableName, tableLocation, isExternal = true, purgeTable = true) + checkTableExists(tableName, tableLocation, exceptedExists = false) } test("test mergetree CTAS simple") { @@ -1143,9 +1110,9 @@ class GlutenClickHouseMergeTreeWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] @@ -1155,10 +1122,8 @@ class GlutenClickHouseMergeTreeWriteSuite assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.isEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert( - addFiles.map(_.rows).sum - == 600572) + assertResult(6)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) } } @@ -1289,9 +1254,9 @@ class GlutenClickHouseMergeTreeWriteSuite | |""".stripMargin - assert( + assertResult("R")( // total rows should remain unchanged - spark.sql(sqlStr2).collect().apply(0).get(0) == "R" + spark.sql(sqlStr2).collect().apply(0).get(0) ) // test select * @@ -1359,40 +1324,38 @@ class GlutenClickHouseMergeTreeWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert( + assertResult("l_shipdate,l_orderkey")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_shipdate,l_orderkey")) - assert( + .mkString(",")) + assertResult("l_shipdate")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .primaryKeyOption .get - .mkString(",") - .equals("l_shipdate")) + .mkString(",")) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => 
f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert(addFiles.map(_.rows).sum == 600572) + assertResult(6)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) val plans = collect(df.queryExecution.executedPlan) { case scanExec: BasicScanExecTransformer => scanExec } - assert(plans.size == 1) - assert(plans(0).metrics("selectedMarksPk").value === 17) - assert(plans(0).metrics("totalMarksPk").value === 74) + assertResult(1)(plans.size) + assertResult(17)(plans.head.metrics("selectedMarksPk").value) + assertResult(74)(plans.head.metrics("totalMarksPk").value) } } @@ -1447,21 +1410,20 @@ class GlutenClickHouseMergeTreeWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert( + assertResult("l_shipdate")( ClickHouseTableV2 .getTable(fileIndex.deltaLog) .orderByKeyOption .get - .mkString(",") - .equals("l_shipdate")) + .mkString(",")) assert( ClickHouseTableV2 .getTable(fileIndex.deltaLog) @@ -1470,15 +1432,15 @@ class GlutenClickHouseMergeTreeWriteSuite assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert(addFiles.map(_.rows).sum == 600572) + assertResult(6)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) val plans = collect(df.queryExecution.executedPlan) { case scanExec: BasicScanExecTransformer => scanExec } - assert(plans.size == 1) - assert(plans(0).metrics("selectedMarksPk").value === 17) - assert(plans(0).metrics("totalMarksPk").value === 74) + assertResult(1)(plans.size) + assertResult(17)(plans.head.metrics("selectedMarksPk").value) + assertResult(74)(plans.head.metrics("totalMarksPk").value) } } @@ -1527,21 +1489,21 @@ class GlutenClickHouseMergeTreeWriteSuite runSql(sqlStr)( df => { val result = df.collect() - assert(result.size == 1) - assert(result(0).getLong(0) == 10) + assertResult(1)(result.length) + assertResult(10)(result(0).getLong(0)) val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 1) - assert(addFiles(0).rows == 10) + assertResult(1)(addFiles.size) + assertResult(10)(addFiles.head.rows) }) } @@ -1585,16 +1547,16 @@ class GlutenClickHouseMergeTreeWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] val addFiles = 
fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert( - (addFiles.map(_.marks).sum - addFiles.size) == mergetreeScan.metrics("totalMarksPk").value) - assert(mergetreeScan.metrics("selectedMarksPk").value == exceptedCnt) + assertResult(mergetreeScan.metrics("totalMarksPk").value)( + addFiles.map(_.marks).sum - addFiles.size) + assertResult(exceptedCnt)(mergetreeScan.metrics("selectedMarksPk").value) } val sqlStr1 = @@ -1609,8 +1571,8 @@ class GlutenClickHouseMergeTreeWriteSuite runSql(sqlStr1)( df => { val result = df.collect() - assert(result.size == 1) - assert(result(0).getDouble(0).toString.substring(0, 6).equals("2.6480")) + assertResult(1)(result.length) + assertResult("2.6480")(result(0).getDouble(0).toString.substring(0, 6)) checkSelectedMarksCnt(df, 34) }) @@ -1627,8 +1589,8 @@ class GlutenClickHouseMergeTreeWriteSuite runSql(sqlStr2)( df => { val result = df.collect() - assert(result.size == 1) - assert(result(0).getDouble(0).toString.substring(0, 6).equals("5.3379")) + assertResult(1)(result.length) + assertResult("5.3379")(result(0).getDouble(0).toString.substring(0, 6)) checkSelectedMarksCnt(df, 29) }) @@ -1666,18 +1628,16 @@ class GlutenClickHouseMergeTreeWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assert(addFiles.size == 6) - assert( - addFiles.map(_.rows).sum - == 600572) + assertResult(6)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) } } @@ -1715,7 +1675,7 @@ class GlutenClickHouseMergeTreeWriteSuite val fileFilter = new WildcardFileFilter("*_0_*") var dataFileList = dataPath.list(fileFilter) - assert(dataFileList.size == 6) + assertResult(6)(dataFileList.length) // test with the normal table spark.sql(s""" @@ -1796,7 +1756,7 @@ class GlutenClickHouseMergeTreeWriteSuite assert(dataPath.isDirectory && dataPath.isDirectory) dataFileList = dataPath.list(fileFilter) - assert(dataFileList.size == 6) + assertResult(6)(dataFileList.length) // re-create the same table for (i <- 0 until 10) { @@ -1818,7 +1778,7 @@ class GlutenClickHouseMergeTreeWriteSuite assert(dataPath.isDirectory && dataPath.isDirectory) dataFileList = dataPath.list(fileFilter) - assert(dataFileList.size == 6) + assertResult(6)(dataFileList.length) } test("test mergetree with primary keys filter pruning by driver") { @@ -1872,22 +1832,22 @@ class GlutenClickHouseMergeTreeWriteSuite Seq(("true", 2), ("false", 3)).foreach( conf => { withSQLConf( - ("spark.gluten.sql.columnar.backend.ch.runtime_settings.enabled_driver_filter_mergetree_index" -> conf._1)) { + "spark.gluten.sql.columnar.backend.ch.runtime_settings.enabled_driver_filter_mergetree_index" -> conf._1) { runTPCHQueryBySQL(6, sqlStr) { df => val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec.size == 1) + assertResult(1)(scanExec.size) - val mergetreeScan = scanExec(0) + val mergetreeScan = scanExec.head assert(mergetreeScan.nodeName.startsWith("Scan mergetree")) val plans = collect(df.queryExecution.executedPlan) { case scanExec: BasicScanExecTransformer => scanExec } - assert(plans.size == 1) - 
assert(plans(0).getSplitInfos.size == conf._2) + assertResult(1)(plans.size) + assertResult(conf._2)(plans.head.getSplitInfos.size) } } }) @@ -1990,14 +1950,14 @@ class GlutenClickHouseMergeTreeWriteSuite Seq(("true", 2), ("false", 2)).foreach( conf => { withSQLConf( - ("spark.gluten.sql.columnar.backend.ch.runtime_settings.enabled_driver_filter_mergetree_index" -> conf._1)) { + "spark.gluten.sql.columnar.backend.ch.runtime_settings.enabled_driver_filter_mergetree_index" -> conf._1) { runTPCHQueryBySQL(12, sqlStr) { df => val scanExec = collect(df.queryExecution.executedPlan) { case f: BasicScanExecTransformer => f } - assert(scanExec.size == 2) - assert(scanExec(1).getSplitInfos.size == conf._2) + assertResult(2)(scanExec.size) + assertResult(conf._2)(scanExec(1).getSplitInfos.size) } } }) diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeLibSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeLibSuite.scala new file mode 100644 index 0000000000000..0221f06bd6814 --- /dev/null +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeLibSuite.scala @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.execution + +import org.apache.gluten.GlutenConfig +import org.apache.gluten.exception.GlutenException +import org.apache.gluten.utils.UTSystemParameters + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.plans.PlanTest + +class GlutenClickHouseNativeLibSuite extends PlanTest { + + private def baseSparkConf: SparkConf = { + new SparkConf() + .set("spark.plugins", "org.apache.gluten.GlutenPlugin") + .set("spark.default.parallelism", "1") + .set("spark.memory.offHeap.enabled", "true") + .set("spark.memory.offHeap.size", "1024MB") + .set("spark.gluten.sql.enable.native.validation", "false") + } + + test("test columnar lib path not exist") { + var spark: SparkSession = null + try { + spark = SparkSession + .builder() + .master("local[1]") + .config(baseSparkConf) + .config(GlutenConfig.GLUTEN_LIB_PATH, "path/not/exist/libch.so") + .getOrCreate() + spark.sql("select 1").show() + } catch { + case e: Exception => + assert(e.isInstanceOf[GlutenException]) + assert( + e.getMessage.contains( + "library at path: path/not/exist/libch.so is not a file or does not exist")) + } finally { + if (spark != null) { + spark.stop() + } + } + } + + test("test CHListenerApi initialize only once") { + var spark: SparkSession = null + try { + spark = SparkSession + .builder() + .master("local[1]") + .config(baseSparkConf) + .config(GlutenConfig.GLUTEN_LIB_PATH, UTSystemParameters.clickHouseLibPath) + .config(GlutenConfig.GLUTEN_EXECUTOR_LIB_PATH, "/path/not/exist/libch.so") + .getOrCreate() + spark.sql("select 1").show() + } finally { + if (spark != null) { + spark.stop() + } + } + } + +} diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala index 9269303d92511..ccf7bb5d5b2ab 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala @@ -21,6 +21,7 @@ import org.apache.gluten.execution.AllDataTypesWithComplexType.genTestData import org.apache.gluten.utils.UTSystemParameters import org.apache.spark.SparkConf +import org.apache.spark.gluten.NativeWriteChecker import org.apache.spark.sql.SparkSession import org.apache.spark.sql.delta.DeltaLog import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper @@ -28,11 +29,14 @@ import org.apache.spark.sql.test.SharedSparkSession import org.scalatest.BeforeAndAfterAll +import scala.reflect.runtime.universe.TypeTag + class GlutenClickHouseNativeWriteTableSuite extends GlutenClickHouseWholeStageTransformerSuite with AdaptiveSparkPlanHelper with SharedSparkSession - with BeforeAndAfterAll { + with BeforeAndAfterAll + with NativeWriteChecker { private var _hiveSpark: SparkSession = _ @@ -114,16 +118,19 @@ class GlutenClickHouseNativeWriteTableSuite def getColumnName(s: String): String = { s.replaceAll("\\(", "_").replaceAll("\\)", "_") } + import collection.immutable.ListMap import java.io.File def writeIntoNewTableWithSql(table_name: String, table_create_sql: String)( fields: Seq[String]): Unit = { - spark.sql(table_create_sql) - spark.sql( - s"insert overwrite $table_name select ${fields.mkString(",")}" + - s" from origin_table") + withDestinationTable(table_name, table_create_sql) { + checkNativeWrite( + s"insert overwrite 
$table_name select ${fields.mkString(",")}" + + s" from origin_table", + checkNative = true) + } } def writeAndCheckRead( @@ -170,82 +177,86 @@ class GlutenClickHouseNativeWriteTableSuite }) } - test("test insert into dir") { - withSQLConf( - ("spark.gluten.sql.native.writer.enabled", "true"), - (GlutenConfig.GLUTEN_ENABLED.key, "true")) { - - val originDF = spark.createDataFrame(genTestData()) - originDF.createOrReplaceTempView("origin_table") + private val fields_ = ListMap( + ("string_field", "string"), + ("int_field", "int"), + ("long_field", "long"), + ("float_field", "float"), + ("double_field", "double"), + ("short_field", "short"), + ("byte_field", "byte"), + ("boolean_field", "boolean"), + ("decimal_field", "decimal(23,12)"), + ("date_field", "date") + ) - val fields: ListMap[String, String] = ListMap( - ("string_field", "string"), - ("int_field", "int"), - ("long_field", "long"), - ("float_field", "float"), - ("double_field", "double"), - ("short_field", "short"), - ("byte_field", "byte"), - ("boolean_field", "boolean"), - ("decimal_field", "decimal(23,12)"), - ("date_field", "date") - ) + def withDestinationTable(table: String, createTableSql: String)(f: => Unit): Unit = { + spark.sql(s"drop table IF EXISTS $table") + spark.sql(s"$createTableSql") + f + } - for (format <- formats) { - spark.sql( - s"insert overwrite local directory '$basePath/test_insert_into_${format}_dir1' " - + s"stored as $format select " - + fields.keys.mkString(",") + - " from origin_table cluster by (byte_field)") - spark.sql( - s"insert overwrite local directory '$basePath/test_insert_into_${format}_dir2' " + - s"stored as $format " + - "select string_field, sum(int_field) as x from origin_table group by string_field") - } + def nativeWrite(f: String => Unit): Unit = { + withSQLConf(("spark.gluten.sql.native.writer.enabled", "true")) { + formats.foreach(f(_)) } } - test("test insert into partition") { - withSQLConf( - ("spark.gluten.sql.native.writer.enabled", "true"), - ("spark.sql.orc.compression.codec", "lz4"), - (GlutenConfig.GLUTEN_ENABLED.key, "true")) { - - val originDF = spark.createDataFrame(genTestData()) - originDF.createOrReplaceTempView("origin_table") - - val fields: ListMap[String, String] = ListMap( - ("string_field", "string"), - ("int_field", "int"), - ("long_field", "long"), - ("float_field", "float"), - ("double_field", "double"), - ("short_field", "short"), - ("byte_field", "byte"), - ("boolean_field", "boolean"), - ("decimal_field", "decimal(23,12)"), - ("date_field", "date") - ) - - for (format <- formats) { - val table_name = table_name_template.format(format) - spark.sql(s"drop table IF EXISTS $table_name") + def nativeWrite2( + f: String => (String, String, String), + extraCheck: (String, String, String) => Unit = null): Unit = nativeWrite { + format => + val (table_name, table_create_sql, insert_sql) = f(format) + withDestinationTable(table_name, table_create_sql) { + checkNativeWrite(insert_sql, checkNative = true) + Option(extraCheck).foreach(_(table_name, table_create_sql, insert_sql)) + } + } - val table_create_sql = - s"create table if not exists $table_name (" + - fields - .map(f => s"${f._1} ${f._2}") - .mkString(",") + - " ) partitioned by (another_date_field date) " + - s"stored as $format" + def nativeWriteWithOriginalView[A <: Product: TypeTag]( + data: Seq[A], + viewName: String, + pairs: (String, String)*)(f: String => Unit): Unit = { + val configs = pairs :+ ("spark.gluten.sql.native.writer.enabled", "true") + withSQLConf(configs: _*) { + withTempView(viewName) { 
+ spark.createDataFrame(data).createOrReplaceTempView(viewName) + formats.foreach(f(_)) + } + } + } - spark.sql(table_create_sql) + test("test insert into dir") { + nativeWriteWithOriginalView(genTestData(), "origin_table") { + format => + Seq( + s"""insert overwrite local directory '$basePath/test_insert_into_${format}_dir1' + |stored as $format select ${fields_.keys.mkString(",")} + |from origin_table""".stripMargin, + s"""insert overwrite local directory '$basePath/test_insert_into_${format}_dir2' + |stored as $format select string_field, sum(int_field) as x + |from origin_table group by string_field""".stripMargin + ).foreach(checkNativeWrite(_, checkNative = true)) + } + } - spark.sql( - s"insert into $table_name partition(another_date_field = '2020-01-01') select " - + fields.keys.mkString(",") + - " from origin_table") + test("test insert into partition") { + def destination(format: String): (String, String, String) = { + val table_name = table_name_template.format(format) + val table_create_sql = + s"""create table if not exists $table_name + |(${fields_.map(f => s"${f._1} ${f._2}").mkString(",")}) + |partitioned by (another_date_field date) stored as $format""".stripMargin + val insert_sql = + s"""insert into $table_name partition(another_date_field = '2020-01-01') + | select ${fields_.keys.mkString(",")} from origin_table""".stripMargin + (table_name, table_create_sql, insert_sql) + } + def nativeFormatWrite(format: String): Unit = { + val (table_name, table_create_sql, insert_sql) = destination(format) + withDestinationTable(table_name, table_create_sql) { + checkNativeWrite(insert_sql, checkNative = true) var files = recursiveListFiles(new File(getWarehouseDir + "/" + table_name)) .filter(_.getName.endsWith(s".$format")) if (format == "orc") { @@ -255,154 +266,103 @@ class GlutenClickHouseNativeWriteTableSuite assert(files.head.getAbsolutePath.contains("another_date_field=2020-01-01")) } } + + nativeWriteWithOriginalView( + genTestData(), + "origin_table", + ("spark.sql.orc.compression.codec", "lz4"))(nativeFormatWrite) } test("test CTAS") { - withSQLConf( - ("spark.gluten.sql.native.writer.enabled", "true"), - (GlutenConfig.GLUTEN_ENABLED.key, "true")) { - - val originDF = spark.createDataFrame(genTestData()) - originDF.createOrReplaceTempView("origin_table") - val fields: ListMap[String, String] = ListMap( - ("string_field", "string"), - ("int_field", "int"), - ("long_field", "long"), - ("float_field", "float"), - ("double_field", "double"), - ("short_field", "short"), - ("byte_field", "byte"), - ("boolean_field", "boolean"), - ("decimal_field", "decimal(23,12)"), - ("date_field", "date") - ) - - for (format <- formats) { + nativeWriteWithOriginalView(genTestData(), "origin_table") { + format => val table_name = table_name_template.format(format) - spark.sql(s"drop table IF EXISTS $table_name") val table_create_sql = s"create table $table_name using $format as select " + - fields + fields_ .map(f => s"${f._1}") .mkString(",") + " from origin_table" - spark.sql(table_create_sql) - spark.sql(s"drop table IF EXISTS $table_name") + val insert_sql = + s"create table $table_name as select " + + fields_ + .map(f => s"${f._1}") + .mkString(",") + + " from origin_table" + withDestinationTable(table_name, table_create_sql) { + spark.sql(s"drop table IF EXISTS $table_name") - try { - val table_create_sql = - s"create table $table_name as select " + - fields - .map(f => s"${f._1}") - .mkString(",") + - " from origin_table" - spark.sql(table_create_sql) - } catch { - case _: 
UnsupportedOperationException => // expected - case _: Exception => fail("should not throw exception") + try { + // FIXME: using checkNativeWrite + spark.sql(insert_sql) + } catch { + case _: UnsupportedOperationException => // expected + case e: Exception => fail("should not throw exception", e) + } } - } } } test("test insert into partition, bigo's case which incur InsertIntoHiveTable") { - withSQLConf( - ("spark.gluten.sql.native.writer.enabled", "true"), - ("spark.sql.hive.convertMetastoreParquet", "false"), - ("spark.sql.hive.convertMetastoreOrc", "false"), - (GlutenConfig.GLUTEN_ENABLED.key, "true") - ) { - - val originDF = spark.createDataFrame(genTestData()) - originDF.createOrReplaceTempView("origin_table") - val fields: ListMap[String, String] = ListMap( - ("string_field", "string"), - ("int_field", "int"), - ("long_field", "long"), - ("float_field", "float"), - ("double_field", "double"), - ("short_field", "short"), - ("byte_field", "byte"), - ("boolean_field", "boolean"), - ("decimal_field", "decimal(23,12)"), - ("date_field", "date") - ) - - for (format <- formats) { - val table_name = table_name_template.format(format) - spark.sql(s"drop table IF EXISTS $table_name") - val table_create_sql = s"create table if not exists $table_name (" + fields - .map(f => s"${f._1} ${f._2}") - .mkString(",") + " ) partitioned by (another_date_field string)" + - s"stored as $format" + def destination(format: String): (String, String, String) = { + val table_name = table_name_template.format(format) + val table_create_sql = s"create table if not exists $table_name (" + fields_ + .map(f => s"${f._1} ${f._2}") + .mkString(",") + " ) partitioned by (another_date_field string)" + + s"stored as $format" + val insert_sql = + s"insert overwrite table $table_name " + + "partition(another_date_field = '2020-01-01') select " + + fields_.keys.mkString(",") + " from (select " + fields_.keys.mkString( + ",") + ", row_number() over (order by int_field desc) as rn " + + "from origin_table where float_field > 3 ) tt where rn <= 100" + (table_name, table_create_sql, insert_sql) + } - spark.sql(table_create_sql) - spark.sql( - s"insert overwrite table $table_name " + - "partition(another_date_field = '2020-01-01') select " - + fields.keys.mkString(",") + " from (select " + fields.keys.mkString( - ",") + ", row_number() over (order by int_field desc) as rn " + - "from origin_table where float_field > 3 ) tt where rn <= 100") + def nativeFormatWrite(format: String): Unit = { + val (table_name, table_create_sql, insert_sql) = destination(format) + withDestinationTable(table_name, table_create_sql) { + checkNativeWrite(insert_sql, checkNative = true) val files = recursiveListFiles(new File(getWarehouseDir + "/" + table_name)) .filter(_.getName.startsWith("part")) assert(files.length == 1) assert(files.head.getAbsolutePath.contains("another_date_field=2020-01-01")) } } + + nativeWriteWithOriginalView( + genTestData(), + "origin_table", + ("spark.sql.hive.convertMetastoreParquet", "false"), + ("spark.sql.hive.convertMetastoreOrc", "false"))(nativeFormatWrite) } test("test 1-col partitioned table") { + nativeWrite { + format => + { + val table_name = table_name_template.format(format) + val table_create_sql = + s"create table if not exists $table_name (" + + fields_ + .filterNot(e => e._1.equals("date_field")) + .map(f => s"${f._1} ${f._2}") + .mkString(",") + + " ) partitioned by (date_field date) " + + s"stored as $format" - withSQLConf(("spark.gluten.sql.native.writer.enabled", "true")) { - - val fields: 
ListMap[String, String] = ListMap( - ("string_field", "string"), - ("int_field", "int"), - ("long_field", "long"), - ("float_field", "float"), - ("double_field", "double"), - ("short_field", "short"), - ("byte_field", "byte"), - ("boolean_field", "boolean"), - ("decimal_field", "decimal(23,12)"), - ("date_field", "date") - ) - - for (format <- formats) { - val table_name = table_name_template.format(format) - val table_create_sql = - s"create table if not exists $table_name (" + - fields - .filterNot(e => e._1.equals("date_field")) - .map(f => s"${f._1} ${f._2}") - .mkString(",") + - " ) partitioned by (date_field date) " + - s"stored as $format" - - writeAndCheckRead( - table_name, - writeIntoNewTableWithSql(table_name, table_create_sql), - fields.keys.toSeq) - } + writeAndCheckRead( + table_name, + writeIntoNewTableWithSql(table_name, table_create_sql), + fields_.keys.toSeq) + } } } // even if disable native writer, this UT fail, spark bug??? ignore("test 1-col partitioned table, partitioned by already ordered column") { withSQLConf(("spark.gluten.sql.native.writer.enabled", "false")) { - val fields: ListMap[String, String] = ListMap( - ("string_field", "string"), - ("int_field", "int"), - ("long_field", "long"), - ("float_field", "float"), - ("double_field", "double"), - ("short_field", "short"), - ("byte_field", "byte"), - ("boolean_field", "boolean"), - ("decimal_field", "decimal(23,12)"), - ("date_field", "date") - ) val originDF = spark.createDataFrame(genTestData()) originDF.createOrReplaceTempView("origin_table") @@ -410,7 +370,7 @@ class GlutenClickHouseNativeWriteTableSuite val table_name = table_name_template.format(format) val table_create_sql = s"create table if not exists $table_name (" + - fields + fields_ .filterNot(e => e._1.equals("date_field")) .map(f => s"${f._1} ${f._2}") .mkString(",") + @@ -420,31 +380,27 @@ class GlutenClickHouseNativeWriteTableSuite spark.sql(s"drop table IF EXISTS $table_name") spark.sql(table_create_sql) spark.sql( - s"insert overwrite $table_name select ${fields.mkString(",")}" + + s"insert overwrite $table_name select ${fields_.mkString(",")}" + s" from origin_table order by date_field") } } } test("test 2-col partitioned table") { - withSQLConf( - ("spark.gluten.sql.native.writer.enabled", "true"), - (GlutenConfig.GLUTEN_ENABLED.key, "true")) { - - val fields: ListMap[String, String] = ListMap( - ("string_field", "string"), - ("int_field", "int"), - ("long_field", "long"), - ("float_field", "float"), - ("double_field", "double"), - ("short_field", "short"), - ("boolean_field", "boolean"), - ("decimal_field", "decimal(23,12)"), - ("date_field", "date"), - ("byte_field", "byte") - ) - - for (format <- formats) { + val fields: ListMap[String, String] = ListMap( + ("string_field", "string"), + ("int_field", "int"), + ("long_field", "long"), + ("float_field", "float"), + ("double_field", "double"), + ("short_field", "short"), + ("boolean_field", "boolean"), + ("decimal_field", "decimal(23,12)"), + ("date_field", "date"), + ("byte_field", "byte") + ) + nativeWrite { + format => val table_name = table_name_template.format(format) val table_create_sql = s"create table if not exists $table_name (" + @@ -458,7 +414,6 @@ class GlutenClickHouseNativeWriteTableSuite table_name, writeIntoNewTableWithSql(table_name, table_create_sql), fields.keys.toSeq) - } } } @@ -506,25 +461,21 @@ class GlutenClickHouseNativeWriteTableSuite // This test case will be failed with incorrect result randomly, ignore first. 
ignore("test hive parquet/orc table, all columns being partitioned. ") { - withSQLConf( - ("spark.gluten.sql.native.writer.enabled", "true"), - (GlutenConfig.GLUTEN_ENABLED.key, "true")) { - - val fields: ListMap[String, String] = ListMap( - ("date_field", "date"), - ("timestamp_field", "timestamp"), - ("string_field", "string"), - ("int_field", "int"), - ("long_field", "long"), - ("float_field", "float"), - ("double_field", "double"), - ("short_field", "short"), - ("byte_field", "byte"), - ("boolean_field", "boolean"), - ("decimal_field", "decimal(23,12)") - ) - - for (format <- formats) { + val fields: ListMap[String, String] = ListMap( + ("date_field", "date"), + ("timestamp_field", "timestamp"), + ("string_field", "string"), + ("int_field", "int"), + ("long_field", "long"), + ("float_field", "float"), + ("double_field", "double"), + ("short_field", "short"), + ("byte_field", "byte"), + ("boolean_field", "boolean"), + ("decimal_field", "decimal(23,12)") + ) + nativeWrite { + format => val table_name = table_name_template.format(format) val table_create_sql = s"create table if not exists $table_name (" + @@ -540,20 +491,15 @@ class GlutenClickHouseNativeWriteTableSuite table_name, writeIntoNewTableWithSql(table_name, table_create_sql), fields.keys.toSeq) - } } } - test(("test hive parquet/orc table with aggregated results")) { - withSQLConf( - ("spark.gluten.sql.native.writer.enabled", "true"), - (GlutenConfig.GLUTEN_ENABLED.key, "true")) { - - val fields: ListMap[String, String] = ListMap( - ("sum(int_field)", "bigint") - ) - - for (format <- formats) { + test("test hive parquet/orc table with aggregated results") { + val fields: ListMap[String, String] = ListMap( + ("sum(int_field)", "bigint") + ) + nativeWrite { + format => val table_name = table_name_template.format(format) val table_create_sql = s"create table if not exists $table_name (" + @@ -566,29 +512,12 @@ class GlutenClickHouseNativeWriteTableSuite table_name, writeIntoNewTableWithSql(table_name, table_create_sql), fields.keys.toSeq) - } } } test("test 1-col partitioned + 1-col bucketed table") { - withSQLConf( - ("spark.gluten.sql.native.writer.enabled", "true"), - (GlutenConfig.GLUTEN_ENABLED.key, "true")) { - - val fields: ListMap[String, String] = ListMap( - ("string_field", "string"), - ("int_field", "int"), - ("long_field", "long"), - ("float_field", "float"), - ("double_field", "double"), - ("short_field", "short"), - ("byte_field", "byte"), - ("boolean_field", "boolean"), - ("decimal_field", "decimal(23,12)"), - ("date_field", "date") - ) - - for (format <- formats) { + nativeWrite { + format => // spark write does not support bucketed table // https://issues.apache.org/jira/browse/SPARK-19256 val table_name = table_name_template.format(format) @@ -604,7 +533,7 @@ class GlutenClickHouseNativeWriteTableSuite .bucketBy(2, "byte_field") .saveAsTable(table_name) }, - fields.keys.toSeq + fields_.keys.toSeq ) assert( @@ -614,10 +543,8 @@ class GlutenClickHouseNativeWriteTableSuite .filter(!_.getName.equals("date_field=__HIVE_DEFAULT_PARTITION__")) .head .listFiles() - .filter(!_.isHidden) - .length == 2 + .count(!_.isHidden) == 2 ) // 2 bucket files - } } } @@ -745,8 +672,8 @@ class GlutenClickHouseNativeWriteTableSuite } test("test consecutive blocks having same partition value") { - withSQLConf(("spark.gluten.sql.native.writer.enabled", "true")) { - for (format <- formats) { + nativeWrite { + format => val table_name = table_name_template.format(format) spark.sql(s"drop table IF EXISTS $table_name") @@ -760,15 +687,14 @@ 
class GlutenClickHouseNativeWriteTableSuite .partitionBy("p") .saveAsTable(table_name) - val ret = spark.sql("select sum(id) from " + table_name).collect().apply(0).apply(0) + val ret = spark.sql(s"select sum(id) from $table_name").collect().apply(0).apply(0) assert(ret == 449985000) - } } } test("test decimal with rand()") { - withSQLConf(("spark.gluten.sql.native.writer.enabled", "true")) { - for (format <- formats) { + nativeWrite { + format => val table_name = table_name_template.format(format) spark.sql(s"drop table IF EXISTS $table_name") spark @@ -778,32 +704,30 @@ class GlutenClickHouseNativeWriteTableSuite .format(format) .partitionBy("p") .saveAsTable(table_name) - val ret = spark.sql("select max(p) from " + table_name).collect().apply(0).apply(0) - } + val ret = spark.sql(s"select max(p) from $table_name").collect().apply(0).apply(0) } } test("test partitioned by constant") { - withSQLConf(("spark.gluten.sql.native.writer.enabled", "true")) { - for (format <- formats) { - spark.sql(s"drop table IF EXISTS tmp_123_$format") - spark.sql( - s"create table tmp_123_$format(" + - s"x1 string, x2 bigint,x3 string, x4 bigint, x5 string )" + - s"partitioned by (day date) stored as $format") - - spark.sql( - s"insert into tmp_123_$format partition(day) " + - "select cast(id as string), id, cast(id as string), id, cast(id as string), " + - "'2023-05-09' from range(10000000)") - } + nativeWrite2 { + format => + val table_name = s"tmp_123_$format" + val create_sql = + s"""create table tmp_123_$format( + |x1 string, x2 bigint,x3 string, x4 bigint, x5 string ) + |partitioned by (day date) stored as $format""".stripMargin + val insert_sql = + s"""insert into tmp_123_$format partition(day) + |select cast(id as string), id, cast(id as string), + | id, cast(id as string), '2023-05-09' + |from range(10000000)""".stripMargin + (table_name, create_sql, insert_sql) } } test("test bucketed by constant") { - withSQLConf(("spark.gluten.sql.native.writer.enabled", "true")) { - - for (format <- formats) { + nativeWrite { + format => val table_name = table_name_template.format(format) spark.sql(s"drop table IF EXISTS $table_name") @@ -815,15 +739,13 @@ class GlutenClickHouseNativeWriteTableSuite .bucketBy(2, "p") .saveAsTable(table_name) - val ret = spark.sql("select count(*) from " + table_name).collect().apply(0).apply(0) - } + assertResult(10000000)(spark.table(table_name).count()) } } test("test consecutive null values being partitioned") { - withSQLConf(("spark.gluten.sql.native.writer.enabled", "true")) { - - for (format <- formats) { + nativeWrite { + format => val table_name = table_name_template.format(format) spark.sql(s"drop table IF EXISTS $table_name") @@ -835,14 +757,13 @@ class GlutenClickHouseNativeWriteTableSuite .partitionBy("p") .saveAsTable(table_name) - val ret = spark.sql("select count(*) from " + table_name).collect().apply(0).apply(0) - } + assertResult(30000)(spark.table(table_name).count()) } } test("test consecutive null values being bucketed") { - withSQLConf(("spark.gluten.sql.native.writer.enabled", "true")) { - for (format <- formats) { + nativeWrite { + format => val table_name = table_name_template.format(format) spark.sql(s"drop table IF EXISTS $table_name") @@ -854,78 +775,79 @@ class GlutenClickHouseNativeWriteTableSuite .bucketBy(2, "p") .saveAsTable(table_name) - val ret = spark.sql("select count(*) from " + table_name).collect().apply(0).apply(0) - } + assertResult(30000)(spark.table(table_name).count()) } } test("test native write with empty dataset") { - 
withSQLConf(("spark.gluten.sql.native.writer.enabled", "true")) { - for (format <- formats) { + nativeWrite2( + format => { val table_name = "t_" + format - spark.sql(s"drop table IF EXISTS $table_name") - spark.sql(s"create table $table_name (id int, str string) stored as $format") - spark.sql( - s"insert into $table_name select id, cast(id as string) from range(10)" + - " where id > 100") + ( + table_name, + s"create table $table_name (id int, str string) stored as $format", + s"insert into $table_name select id, cast(id as string) from range(10) where id > 100" + ) + }, + (table_name, _, _) => { + assertResult(0)(spark.table(table_name).count()) } - } + ) } test("test native write with union") { - withSQLConf(("spark.gluten.sql.native.writer.enabled", "true")) { - for (format <- formats) { + nativeWrite { + format => val table_name = "t_" + format - spark.sql(s"drop table IF EXISTS $table_name") - spark.sql(s"create table $table_name (id int, str string) stored as $format") - spark.sql( - s"insert overwrite table $table_name " + - "select id, cast(id as string) from range(10) union all " + - "select 10, '10' from range(10)") - spark.sql( - s"insert overwrite table $table_name " + - "select id, cast(id as string) from range(10) union all " + - "select 10, cast(id as string) from range(10)") - - } + withDestinationTable( + table_name, + s"create table $table_name (id int, str string) stored as $format") { + checkNativeWrite( + s"insert overwrite table $table_name " + + "select id, cast(id as string) from range(10) union all " + + "select 10, '10' from range(10)", + checkNative = true) + checkNativeWrite( + s"insert overwrite table $table_name " + + "select id, cast(id as string) from range(10) union all " + + "select 10, cast(id as string) from range(10)", + checkNative = true + ) + } } } test("test native write and non-native read consistency") { - withSQLConf(("spark.gluten.sql.native.writer.enabled", "true")) { - for (format <- formats) { - val table_name = "t_" + format - spark.sql(s"drop table IF EXISTS $table_name") - spark.sql(s"create table $table_name (id int, name string, info char(4)) stored as $format") - spark.sql( - s"insert overwrite table $table_name " + - "select id, cast(id as string), concat('aaa', cast(id as string)) from range(10)") + nativeWrite2( + { + format => + val table_name = "t_" + format + ( + table_name, + s"create table $table_name (id int, name string, info char(4)) stored as $format", + s"insert overwrite table $table_name " + + "select id, cast(id as string), concat('aaa', cast(id as string)) from range(10)" + ) + }, + (table_name, _, _) => compareResultsAgainstVanillaSpark( s"select * from $table_name", compareResult = true, _ => {}) - } - } + ) } test("GLUTEN-4316: fix crash on dynamic partition inserting") { - withSQLConf( - ("spark.gluten.sql.native.writer.enabled", "true"), - (GlutenConfig.GLUTEN_ENABLED.key, "true")) { - formats.foreach( - format => { - val tbl = "t_" + format - spark.sql(s"drop table IF EXISTS $tbl") - val sql1 = - s"create table $tbl(a int, b map, c struct) " + - s"partitioned by (day string) stored as $format" - val sql2 = s"insert overwrite $tbl partition (day) " + - s"select id as a, str_to_map(concat('t1:','a','&t2:','b'),'&',':'), " + - s"struct('1', null) as c, '2024-01-08' as day from range(10)" - spark.sql(sql1) - spark.sql(sql2) - }) + nativeWrite2 { + format => + val tbl = "t_" + format + val sql1 = + s"create table $tbl(a int, b map, c struct) " + + s"partitioned by (day string) stored as $format" + val sql2 = s"insert 
overwrite $tbl partition (day) " + + s"select id as a, str_to_map(concat('t1:','a','&t2:','b'),'&',':'), " + + s"struct('1', null) as c, '2024-01-08' as day from range(10)" + (tbl, sql1, sql2) } } - } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCDSParquetGraceHashJoinSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCDSParquetGraceHashJoinSuite.scala index 0b7ad9a6d8ac1..04ccda29b1aed 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCDSParquetGraceHashJoinSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCDSParquetGraceHashJoinSuite.scala @@ -16,13 +16,9 @@ */ package org.apache.gluten.execution -import org.apache.gluten.test.FallbackUtil - import org.apache.spark.SparkConf -import org.apache.spark.sql.catalyst.expressions.{DynamicPruningExpression, Not} -import org.apache.spark.sql.execution._ +import org.apache.spark.sql.catalyst.expressions.DynamicPruningExpression import org.apache.spark.sql.execution.exchange.ReusedExchangeExec -import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec class GlutenClickHouseTPCDSParquetGraceHashJoinSuite extends GlutenClickHouseTPCDSAbstractSuite { @@ -39,105 +35,11 @@ class GlutenClickHouseTPCDSParquetGraceHashJoinSuite extends GlutenClickHouseTPC .set("spark.sql.autoBroadcastJoinThreshold", "10MB") .set("spark.memory.offHeap.size", "8g") .set("spark.gluten.sql.columnar.backend.ch.runtime_settings.join_algorithm", "grace_hash") - .set("spark.gluten.sql.columnar.backend.ch.runtime_settings.max_bytes_in_join", "3145728") + .set("spark.gluten.sql.columnar.backend.ch.runtime_settings.max_bytes_in_join", "314572800") } executeTPCDSTest(false); - test( - "test fallback operations not supported by ch backend " + - "in CHHashJoinExecTransformer && CHBroadcastHashJoinExecTransformer") { - val testSql = - """ - | SELECT i_brand_id AS brand_id, i_brand AS brand, i_manufact_id, i_manufact, - | sum(ss_ext_sales_price) AS ext_price - | FROM date_dim - | LEFT JOIN store_sales ON d_date_sk = ss_sold_date_sk - | LEFT JOIN item ON ss_item_sk = i_item_sk AND i_manager_id = 7 - | LEFT JOIN customer ON ss_customer_sk = c_customer_sk - | LEFT JOIN customer_address ON c_current_addr_sk = ca_address_sk - | LEFT JOIN store ON ss_store_sk = s_store_sk AND substr(ca_zip,1,5) <> substr(s_zip,1,5) - | WHERE d_moy = 11 - | AND d_year = 1999 - | GROUP BY i_brand_id, i_brand, i_manufact_id, i_manufact - | ORDER BY ext_price DESC, i_brand, i_brand_id, i_manufact_id, i_manufact - | LIMIT 100; - |""".stripMargin - - val df = spark.sql(testSql) - val operateWithCondition = df.queryExecution.executedPlan.collect { - case f: BroadcastHashJoinExec if f.condition.get.isInstanceOf[Not] => f - } - assert( - operateWithCondition(0).left - .asInstanceOf[InputAdapter] - .child - .isInstanceOf[CHColumnarToRowExec]) - } - - test("test fallbackutils") { - val testSql = - """ - | SELECT i_brand_id AS brand_id, i_brand AS brand, i_manufact_id, i_manufact, - | sum(ss_ext_sales_price) AS ext_price - | FROM date_dim - | LEFT JOIN store_sales ON d_date_sk = ss_sold_date_sk - | LEFT JOIN item ON ss_item_sk = i_item_sk AND i_manager_id = 7 - | LEFT JOIN customer ON ss_customer_sk = c_customer_sk - | LEFT JOIN customer_address ON c_current_addr_sk = ca_address_sk - | LEFT JOIN store ON ss_store_sk = s_store_sk AND substr(ca_zip,1,5) <> substr(s_zip,1,5) - | WHERE d_moy = 11 - | AND d_year = 1999 - | GROUP BY 
i_brand_id, i_brand, i_manufact_id, i_manufact - | ORDER BY ext_price DESC, i_brand, i_brand_id, i_manufact_id, i_manufact - | LIMIT 100; - |""".stripMargin - - val df = spark.sql(testSql) - assert(FallbackUtil.hasFallback(df.queryExecution.executedPlan)) - } - - test("Gluten-4458: test clickhouse not support join with IN condition") { - val testSql = - """ - | SELECT * - | FROM date_dim t1 - | LEFT JOIN date_dim t2 ON t1.d_date_sk = t2.d_date_sk - | AND datediff(t1.d_day_name, t2.d_day_name) IN (1, 3) - | LIMIT 100; - |""".stripMargin - - val df = spark.sql(testSql) - assert(FallbackUtil.hasFallback(df.queryExecution.executedPlan)) - } - - test("Gluten-4458: test join with Equal computing two table in one side") { - val testSql = - """ - | SELECT * - | FROM date_dim t1 - | LEFT JOIN date_dim t2 ON t1.d_date_sk = t2.d_date_sk AND t1.d_year - t2.d_year = 1 - | LIMIT 100; - |""".stripMargin - - val df = spark.sql(testSql) - assert(FallbackUtil.hasFallback(df.queryExecution.executedPlan)) - } - - test("Gluten-4458: test inner join can support join with IN condition") { - val testSql = - """ - | SELECT * - | FROM date_dim t1 - | INNER JOIN date_dim t2 ON t1.d_date_sk = t2.d_date_sk - | AND datediff(t1.d_day_name, t2.d_day_name) IN (1, 3) - | LIMIT 100; - |""".stripMargin - - val df = spark.sql(testSql) - assert(!FallbackUtil.hasFallback(df.queryExecution.executedPlan)) - } - test("Gluten-1235: Fix missing reading from the broadcasted value when executing DPP") { val testSql = """ @@ -198,55 +100,4 @@ class GlutenClickHouseTPCDSParquetGraceHashJoinSuite extends GlutenClickHouseTPC } } } - - test("TPCDS Q21 with non-separated scan rdd") { - withSQLConf(("spark.gluten.sql.columnar.separate.scan.rdd.for.ch", "false")) { - runTPCDSQuery("q21") { - df => - val foundDynamicPruningExpr = df.queryExecution.executedPlan.find { - case f: FileSourceScanExecTransformer => - f.partitionFilters.exists { - case _: DynamicPruningExpression => true - case _ => false - } - case _ => false - } - assert(foundDynamicPruningExpr.nonEmpty == true) - - val reuseExchange = df.queryExecution.executedPlan.find { - case r: ReusedExchangeExec => true - case _ => false - } - assert(reuseExchange.nonEmpty == true) - } - } - } - - test("Gluten-4452: Fix get wrong hash table when multi joins in a task") { - val testSql = - """ - | SELECT ws_item_sk, ws_sold_date_sk, ws_ship_date_sk, - | t3.d_date_id as sold_date_id, t2.d_date_id as ship_date_id - | FROM ( - | SELECT ws_item_sk, ws_sold_date_sk, ws_ship_date_sk, t1.d_date_id - | FROM web_sales - | LEFT JOIN - | (SELECT d_date_id, d_date_sk from date_dim GROUP BY d_date_id, d_date_sk) t1 - | ON ws_sold_date_sk == t1.d_date_sk) t3 - | INNER JOIN - | (SELECT d_date_id, d_date_sk from date_dim GROUP BY d_date_id, d_date_sk) t2 - | ON ws_ship_date_sk == t2.d_date_sk - | LIMIT 100; - |""".stripMargin - compareResultsAgainstVanillaSpark( - testSql, - true, - df => { - val foundBroadcastHashJoinExpr = df.queryExecution.executedPlan.collect { - case f: CHBroadcastHashJoinExecTransformer => f - } - assert(foundBroadcastHashJoinExpr.size == 2) - } - ) - } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCDSParquetSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCDSParquetSuite.scala index a63e47888cb90..e9c27437b44a0 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCDSParquetSuite.scala +++ 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCDSParquetSuite.scala @@ -16,13 +16,10 @@ */ package org.apache.gluten.execution -import org.apache.gluten.test.FallbackUtil - import org.apache.spark.SparkConf -import org.apache.spark.sql.catalyst.expressions.{DynamicPruningExpression, Not} +import org.apache.spark.sql.catalyst.expressions.DynamicPruningExpression import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.exchange.ReusedExchangeExec -import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec} // Some sqls' line length exceeds 100 // scalastyle:off line.size.limit @@ -121,38 +118,7 @@ class GlutenClickHouseTPCDSParquetSuite extends GlutenClickHouseTPCDSAbstractSui assert(result(0).getLong(0) == 73049) } - test( - "test fallback operations not supported by ch backend " + - "in CHHashJoinExecTransformer && CHBroadcastHashJoinExecTransformer") { - val testSql = - """ - |SELECT i_brand_id AS brand_id, i_brand AS brand, i_manufact_id, i_manufact, - | sum(ss_ext_sales_price) AS ext_price - | FROM date_dim - | LEFT JOIN store_sales ON d_date_sk = ss_sold_date_sk - | LEFT JOIN item ON ss_item_sk = i_item_sk AND i_manager_id = 7 - | LEFT JOIN customer ON ss_customer_sk = c_customer_sk - | LEFT JOIN customer_address ON c_current_addr_sk = ca_address_sk - | LEFT JOIN store ON ss_store_sk = s_store_sk AND substr(ca_zip,1,5) <> substr(s_zip,1,5) - | WHERE d_moy = 11 - | AND d_year = 1999 - | GROUP BY i_brand_id, i_brand, i_manufact_id, i_manufact - | ORDER BY ext_price DESC, i_brand, i_brand_id, i_manufact_id, i_manufact - | LIMIT 100; - |""".stripMargin - - val df = spark.sql(testSql) - val operateWithCondition = df.queryExecution.executedPlan.collect { - case f: BroadcastHashJoinExec if f.condition.get.isInstanceOf[Not] => f - } - assert( - operateWithCondition(0).left - .asInstanceOf[InputAdapter] - .child - .isInstanceOf[CHColumnarToRowExec]) - } - - test("test fallbackutils") { + test("Test join with mixed condition 1") { val testSql = """ |SELECT i_brand_id AS brand_id, i_brand AS brand, i_manufact_id, i_manufact, @@ -169,36 +135,7 @@ class GlutenClickHouseTPCDSParquetSuite extends GlutenClickHouseTPCDSAbstractSui | ORDER BY ext_price DESC, i_brand, i_brand_id, i_manufact_id, i_manufact | LIMIT 100; |""".stripMargin - - val df = spark.sql(testSql) - assert(FallbackUtil.hasFallback(df.queryExecution.executedPlan)) - } - - test( - "Test avoid forceShuffledHashJoin when the join condition" + - " does not supported by the backend") { - val testSql = - """ - |SELECT /*+ merge(date_dim)*/ i_brand_id AS brand_id, i_brand AS brand, i_manufact_id, i_manufact, - | sum(ss_ext_sales_price) AS ext_price - | FROM date_dim - | LEFT JOIN store_sales ON d_date_sk == ss_sold_date_sk AND (d_date_sk = 213232 OR ss_sold_date_sk = 3232) - | LEFT JOIN item ON ss_item_sk = i_item_sk AND i_manager_id = 7 - | LEFT JOIN customer ON ss_customer_sk = c_customer_sk - | LEFT JOIN customer_address ON c_current_addr_sk = ca_address_sk - | LEFT JOIN store ON ss_store_sk = s_store_sk AND substr(ca_zip,1,5) <> substr(s_zip,1,5) - | WHERE d_moy = 11 - | AND d_year = 1999 - | GROUP BY i_brand_id, i_brand, i_manufact_id, i_manufact - | ORDER BY ext_price DESC, i_brand, i_brand_id, i_manufact_id, i_manufact - | LIMIT 100; - |""".stripMargin - - val df = spark.sql(testSql) - val sortMergeJoinExec = df.queryExecution.executedPlan.collect { - case s: SortMergeJoinExec => s - } - assert(sortMergeJoinExec.nonEmpty) + 
compareResultsAgainstVanillaSpark(testSql, true, _ => {}) } test("Gluten-1235: Fix missing reading from the broadcasted value when executing DPP") { diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHNullableColumnarShuffleSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHNullableColumnarShuffleSuite.scala index 245b52d371091..c5f67f45d5771 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHNullableColumnarShuffleSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHNullableColumnarShuffleSuite.scala @@ -17,7 +17,7 @@ package org.apache.gluten.execution import org.apache.spark.SparkConf -import org.apache.spark.sql.catalyst.optimizer.BuildLeft +import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight} class GlutenClickHouseTPCHNullableColumnarShuffleSuite extends GlutenClickHouseTPCHAbstractSuite { @@ -63,7 +63,11 @@ class GlutenClickHouseTPCHNullableColumnarShuffleSuite extends GlutenClickHouseT val shjBuildLeft = df.queryExecution.executedPlan.collect { case shj: ShuffledHashJoinExecTransformerBase if shj.joinBuildSide == BuildLeft => shj } - assert(shjBuildLeft.size == 2) + assert(shjBuildLeft.size == 1) + val shjBuildRight = df.queryExecution.executedPlan.collect { + case shj: ShuffledHashJoinExecTransformerBase if shj.joinBuildSide == BuildRight => shj + } + assert(shjBuildRight.size == 1) } } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHNullableSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHNullableSuite.scala index 0eb4de74209b6..7f62c69931577 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHNullableSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHNullableSuite.scala @@ -20,7 +20,7 @@ import org.apache.gluten.GlutenConfig import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.expressions.Alias -import org.apache.spark.sql.catalyst.optimizer.BuildLeft +import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight} class GlutenClickHouseTPCHNullableSuite extends GlutenClickHouseTPCHAbstractSuite { @@ -66,7 +66,11 @@ class GlutenClickHouseTPCHNullableSuite extends GlutenClickHouseTPCHAbstractSuit val shjBuildLeft = df.queryExecution.executedPlan.collect { case shj: ShuffledHashJoinExecTransformerBase if shj.joinBuildSide == BuildLeft => shj } - assert(shjBuildLeft.size == 2) + assert(shjBuildLeft.size == 1) + val shjBuildRight = df.queryExecution.executedPlan.collect { + case shj: ShuffledHashJoinExecTransformerBase if shj.joinBuildSide == BuildRight => shj + } + assert(shjBuildRight.size == 1) } } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHParquetAQEConcurrentSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHParquetAQEConcurrentSuite.scala index 34e9658fb4197..9f4befbb01a9b 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHParquetAQEConcurrentSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHParquetAQEConcurrentSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.types.DoubleType import java.util.concurrent.ForkJoinPool import 
scala.collection.parallel.ForkJoinTaskSupport +import scala.collection.parallel.immutable.ParVector class GlutenClickHouseTPCHParquetAQEConcurrentSuite extends GlutenClickHouseTPCHAbstractSuite @@ -74,7 +75,7 @@ class GlutenClickHouseTPCHParquetAQEConcurrentSuite test("fix race condition at the global variable of ColumnarOverrideRules::isAdaptiveContext") { - val queries = ((1 to 22) ++ (1 to 22) ++ (1 to 22) ++ (1 to 22)).par + val queries = ParVector((1 to 22) ++ (1 to 22) ++ (1 to 22) ++ (1 to 22): _*) queries.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(22)) queries.map(queryId => runTPCHQuery(queryId) { df => }) diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHParquetBucketSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHParquetBucketSuite.scala index 14d3e0130b4c4..c164fae708f8f 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHParquetBucketSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHParquetBucketSuite.scala @@ -627,7 +627,7 @@ class GlutenClickHouseTPCHParquetBucketSuite } val touchedBuckets = scanExec.head.getPartitions .flatMap(partition => partition.asInstanceOf[FilePartition].files) - .flatMap(f => BucketingUtils.getBucketId(new Path(f.filePath).getName)) + .flatMap(f => BucketingUtils.getBucketId(new Path(f.filePath.toString()).getName)) .distinct // two files from part0-0,part0-1,part1-0,part1-1 assert(touchedBuckets.size == 1) diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSaltNullParquetSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSaltNullParquetSuite.scala index eec0ad874c5d1..c0f37b08616ea 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSaltNullParquetSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSaltNullParquetSuite.scala @@ -719,10 +719,17 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr "select unix_timestamp(concat(cast(l_shipdate as String), ' 00:00:00')) " + "from lineitem order by l_shipdate limit 10;")( checkGlutenOperatorMatch[ProjectExecTransformer]) + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") { + runQueryAndCompare( + "select to_unix_timestamp(concat(cast(l_shipdate as String), ' 00:00:00')) " + + "from lineitem order by l_shipdate limit 10")( + checkGlutenOperatorMatch[ProjectExecTransformer]) + } } test("test literals") { - val query = """ + val query = + """ SELECT CAST(NULL AS BOOLEAN) AS boolean_literal, CAST(1 AS TINYINT) AS tinyint_literal, @@ -937,7 +944,15 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr |from nation |order by n_regionkey, n_nationkey, n_lag |""".stripMargin + val sql1 = + """ + | select n_regionkey, n_nationkey, + | lag(n_nationkey, 1, n_nationkey) OVER (PARTITION BY n_regionkey ORDER BY n_nationkey) as n_lag + |from nation + |order by n_regionkey, n_nationkey, n_lag + |""".stripMargin compareResultsAgainstVanillaSpark(sql, true, { _ => }) + compareResultsAgainstVanillaSpark(sql1, true, { _ => }, false) } test("window lag with null value") { @@ -1311,6 +1326,13 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr } } + test("bit_get/bit_count") { + runQueryAndCompare( + "select bit_count(id), bit_get(id, 0), 
bit_get(id, 1), bit_get(id, 2), bit_get(id, 3) from range(100)") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + test("test 'EqualNullSafe'") { runQueryAndCompare("select l_linenumber <=> l_orderkey, l_linenumber <=> null from lineitem") { checkGlutenOperatorMatch[ProjectExecTransformer] @@ -1321,9 +1343,10 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr spark.sql("create table test_1767 (id bigint, data map) using parquet") spark.sql("INSERT INTO test_1767 values(1, map('k', 'v'))") - val sql = """ - | select id from test_1767 lateral view - | posexplode(split(data['k'], ',')) tx as a, b""".stripMargin + val sql = + """ + | select id from test_1767 lateral view + | posexplode(split(data['k'], ',')) tx as a, b""".stripMargin runQueryAndCompare(sql)(checkGlutenOperatorMatch[CHGenerateExecTransformer]) spark.sql("drop table test_1767") @@ -2032,10 +2055,15 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr """ |select to_json(struct(cast(id as string), id, 1.1, 1.1f, 1.1d)) from range(3) |""".stripMargin + val sql1 = + """ + | select to_json(named_struct('name', concat('/val/', id))) from range(3) + |""".stripMargin // cast('nan' as double) output 'NaN' in Spark, 'nan' in CH // cast('inf' as double) output 'Infinity' in Spark, 'inf' in CH // ignore them temporarily runQueryAndCompare(sql)(checkGlutenOperatorMatch[ProjectExecTransformer]) + runQueryAndCompare(sql1)(checkGlutenOperatorMatch[ProjectExecTransformer]) } test("GLUTEN-3501: test json output format with struct contains null value") { @@ -2082,21 +2110,23 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr } test("GLUTEN-3149 convert Nan to int") { - val sql = """ - | select cast(a as Int) as n from( - | select cast(s as Float) as a from( - | select if(n_name='ALGERIA', 'nan', '1.0') as s from nation - | ))""".stripMargin + val sql = + """ + | select cast(a as Int) as n from( + | select cast(s as Float) as a from( + | select if(n_name='ALGERIA', 'nan', '1.0') as s from nation + | ))""".stripMargin compareResultsAgainstVanillaSpark(sql, true, { _ => }) } test("GLUTEN-3149 convert Inf to int") { - val sql = """ - | select n_regionkey, n is null, isnan(n), cast(n as int) from ( - | select n_regionkey, x, n_regionkey/(x) as n from ( - | select n_regionkey, cast(n_nationkey as float) as x from nation - | )t1 - | )t2""".stripMargin + val sql = + """ + | select n_regionkey, n is null, isnan(n), cast(n as int) from ( + | select n_regionkey, x, n_regionkey/(x) as n from ( + | select n_regionkey, cast(n_nationkey as float) as x from nation + | )t1 + | )t2""".stripMargin compareResultsAgainstVanillaSpark(sql, true, { _ => }) } @@ -2551,5 +2581,124 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr compareResultsAgainstVanillaSpark(select_sql, true, { _ => }) spark.sql("drop table test_tbl_5096") } + + test("GLUTEN-5896: Bug fix greatest/least diff") { + val tbl_create_sql = + "create table test_tbl_5896(id bigint, x1 int, x2 int, x3 int) using parquet" + val tbl_insert_sql = + "insert into test_tbl_5896 values(1, 12, NULL, 13), (2, NULL, NULL, NULL), (3, 11, NULL, NULL), (4, 10, 9, 8)" + val select_sql = "select id, greatest(x1, x2, x3), least(x1, x2, x3) from test_tbl_5896" + spark.sql(tbl_create_sql) + spark.sql(tbl_insert_sql) + compareResultsAgainstVanillaSpark(select_sql, true, { _ => }) + spark.sql("drop table test_tbl_5896") + } + + test("test left with len -1") { + val tbl_create_sql = + 
"create table test_left(col string) using parquet" + val tbl_insert_sql = + "insert into test_left values('test1'), ('test2')" + spark.sql(tbl_create_sql) + spark.sql(tbl_insert_sql) + compareResultsAgainstVanillaSpark("select left(col, -1) from test_left", true, { _ => }) + compareResultsAgainstVanillaSpark("select left(col, -2) from test_left", true, { _ => }) + compareResultsAgainstVanillaSpark("select substring(col, 0, -1) from test_left", true, { _ => }) + spark.sql("drop table test_left") + } + + test("Inequal join support") { + withSQLConf(("spark.sql.autoBroadcastJoinThreshold", "-1")) { + spark.sql("create table ineq_join_t1 (key bigint, value bigint) using parquet"); + spark.sql("create table ineq_join_t2 (key bigint, value bigint) using parquet"); + spark.sql("insert into ineq_join_t1 values(1, 1), (2, 2), (3, 3), (4, 4), (5, 5)"); + spark.sql("insert into ineq_join_t2 values(2, 2), (2, 1), (3, 3), (4, 6), (5, 3)"); + val sql1 = + """ + | select t1.key, t1.value, t2.key, t2.value from ineq_join_t1 as t1 + | left join ineq_join_t2 as t2 + | on t1.key = t2.key and t1.value > t2.value + |""".stripMargin + compareResultsAgainstVanillaSpark(sql1, true, { _ => }) + + val sql2 = + """ + | select t1.key, t1.value from ineq_join_t1 as t1 + | left join ineq_join_t2 as t2 + | on t1.key = t2.key and t1.value > t2.value and t1.value > t2.key + |""".stripMargin + compareResultsAgainstVanillaSpark(sql2, true, { _ => }) + spark.sql("drop table ineq_join_t1") + spark.sql("drop table ineq_join_t2") + } + } + + test("GLUTEN-5910: Fix ASTLiteral type is lost in CH") { + spark.sql("create table test_tbl_5910_0(c_time bigint, type int) using parquet") + spark.sql("create table test_tbl_5910_1(type int) using parquet") + spark.sql("insert into test_tbl_5910_0 values(1717209159, 12)") + spark.sql("insert into test_tbl_5910_1 values(12)") + val select_sql = + """ + | select t1.cday, t2.type from ( + | select type, to_date(from_unixtime(c_time)) as cday from test_tbl_5910_0 ) t1 + | left join ( + | select type, "2024-06-01" as cday from test_tbl_5910_1 ) t2 + | on t1.cday = t2.cday and t1.type = t2.type + |""".stripMargin + compareResultsAgainstVanillaSpark(select_sql, true, { _ => }) + spark.sql("drop table test_tbl_5910_0") + spark.sql("drop table test_tbl_5910_1") + } + + test("GLUTEN-4451: Fix schema may be changed by filter") { + val create_sql = + """ + |create table if not exists test_tbl_4451( + | month_day string, + | month_dif int, + | is_month_new string, + | country string, + | os string, + | mr bigint + |) using parquet + |PARTITIONED BY ( + | day string, + | app_name string) + |""".stripMargin + val insert_sql1 = + "INSERT into test_tbl_4451 partition (day='2024-06-01', app_name='abc') " + + "values('2024-06-01', 0, '1', 'CN', 'iOS', 100)" + val insert_sql2 = + "INSERT into test_tbl_4451 partition (day='2024-06-01', app_name='abc') " + + "values('2024-06-01', 0, '1', 'CN', 'iOS', 50)" + val insert_sql3 = + "INSERT into test_tbl_4451 partition (day='2024-06-01', app_name='abc') " + + "values('2024-06-01', 1, '1', 'CN', 'iOS', 80)" + spark.sql(create_sql) + spark.sql(insert_sql1) + spark.sql(insert_sql2) + spark.sql(insert_sql3) + val select_sql = + """ + |SELECT * FROM ( + | SELECT + | month_day, + | country, + | if(os = 'ALite','Android',os) AS os, + | is_month_new, + | nvl(sum(if(month_dif = 0, mr, 0)),0) AS `month0_n`, + | nvl(sum(if(month_dif = 1, mr, 0)) / sum(if(month_dif = 0, mr, 0)),0) AS `month1_rate`, + | '2024-06-18' as day, + | app_name + | FROM test_tbl_4451 + | GROUP BY 
month_day,country,if(os = 'ALite','Android',os),is_month_new,app_name + |) tt + |WHERE month0_n > 0 AND month1_rate <= 1 AND os IN ('all','Android','iOS') + | AND app_name IS NOT NULL + |""".stripMargin + compareResultsAgainstVanillaSpark(select_sql, true, { _ => }) + spark.sql("drop table test_tbl_4451") + } } // scalastyle:on line.size.limit diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSuite.scala index 6dcb83de4740b..d26891ddb1eaa 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSuite.scala @@ -18,7 +18,7 @@ package org.apache.gluten.execution import org.apache.spark.SparkConf import org.apache.spark.sql.{Row, TestUtils} -import org.apache.spark.sql.catalyst.optimizer.BuildLeft +import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight} import org.apache.spark.sql.types.{DecimalType, StructType} // Some sqls' line length exceeds 100 @@ -73,7 +73,11 @@ class GlutenClickHouseTPCHSuite extends GlutenClickHouseTPCHAbstractSuite { val shjBuildLeft = df.queryExecution.executedPlan.collect { case shj: ShuffledHashJoinExecTransformerBase if shj.joinBuildSide == BuildLeft => shj } - assert(shjBuildLeft.size == 2) + assert(shjBuildLeft.size == 1) + val shjBuildRight = df.queryExecution.executedPlan.collect { + case shj: ShuffledHashJoinExecTransformerBase if shj.joinBuildSide == BuildRight => shj + } + assert(shjBuildRight.size == 1) } } } @@ -481,5 +485,20 @@ class GlutenClickHouseTPCHSuite extends GlutenClickHouseTPCHAbstractSuite { spark.sql(table_drop_sql) } + + test("GLUTEN-5904 NaN values from stddev") { + val sql1 = + """ + |select a, stddev(b/c) from (select * from values (1,2, 1), (1,3,0) as data(a,b,c)) + |group by a + |""".stripMargin + compareResultsAgainstVanillaSpark(sql1, true, { _ => }) + val sql2 = + """ + |select a, stddev(b) from (select * from values (1,2, 1) as data(a,b,c)) group by a + |""".stripMargin + compareResultsAgainstVanillaSpark(sql2, true, { _ => }) + + } } // scalastyle:off line.size.limit diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTableAfterRestart.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTableAfterRestart.scala index 9e55df0fa836a..f9e831cb4aa7b 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTableAfterRestart.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTableAfterRestart.scala @@ -16,8 +16,6 @@ */ package org.apache.gluten.execution -import org.apache.gluten.GlutenConfig - import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.apache.spark.sql.SparkSession.{getActiveSession, getDefaultSession} @@ -63,6 +61,9 @@ class GlutenClickHouseTableAfterRestart .set( "spark.gluten.sql.columnar.backend.ch.runtime_settings.mergetree.merge_after_insert", "false") + .set( + "spark.gluten.sql.columnar.backend.ch.runtime_settings.input_format_parquet_max_block_size", + "8192") } override protected def createTPCHNotNullTables(): Unit = { @@ -182,9 +183,9 @@ class GlutenClickHouseTableAfterRestart // for this run, missing count should not increase runTPCHQueryBySQL(1, sqlStr)(_ => {}) val stats1 = ClickhouseSnapshot.deltaScanCache.stats() - assert(stats1.missCount() 
- oldMissingCount1 == 0) + assertResult(oldMissingCount1)(stats1.missCount()) val stats2 = ClickhouseSnapshot.addFileToAddMTPCache.stats() - assert(stats2.missCount() - oldMissingCount2 == 0) + assertResult(oldMissingCount2)(stats2.missCount()) } val oldMissingCount1 = ClickhouseSnapshot.deltaScanCache.stats().missCount() @@ -196,10 +197,9 @@ class GlutenClickHouseTableAfterRestart // after restart, additionally check stats of delta scan cache val stats1 = ClickhouseSnapshot.deltaScanCache.stats() - assert(stats1.missCount() - oldMissingCount1 == 1) + assertResult(oldMissingCount1 + 1)(stats1.missCount()) val stats2 = ClickhouseSnapshot.addFileToAddMTPCache.stats() - assert(stats2.missCount() - oldMissingCount2 == 6) - + assertResult(oldMissingCount2 + 6)(stats2.missCount()) } test("test optimize after restart") { @@ -224,7 +224,8 @@ class GlutenClickHouseTableAfterRestart restartSpark() spark.sql("optimize table_restart_optimize") - assert(spark.sql("select count(*) from table_restart_optimize").collect().apply(0).get(0) == 4) + assertResult(4)( + spark.sql("select count(*) from table_restart_optimize").collect().apply(0).get(0)) } test("test vacuum after restart") { @@ -250,11 +251,10 @@ class GlutenClickHouseTableAfterRestart restartSpark() - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=false") spark.sql("vacuum table_restart_vacuum") - spark.sql(s"set ${GlutenConfig.GLUTEN_ENABLED.key}=true") - assert(spark.sql("select count(*) from table_restart_vacuum").collect().apply(0).get(0) == 4) + assertResult(4)( + spark.sql("select count(*) from table_restart_vacuum").collect().apply(0).get(0)) } test("test update after restart") { @@ -280,7 +280,8 @@ class GlutenClickHouseTableAfterRestart spark.sql("update table_restart_update set name = 'tom' where id = 1") - assert(spark.sql("select count(*) from table_restart_update").collect().apply(0).get(0) == 4) + assertResult(4)( + spark.sql("select count(*) from table_restart_update").collect().apply(0).get(0)) } test("test delete after restart") { @@ -306,7 +307,8 @@ class GlutenClickHouseTableAfterRestart spark.sql("delete from table_restart_delete where where id = 1") - assert(spark.sql("select count(*) from table_restart_delete").collect().apply(0).get(0) == 2) + assertResult(2)( + spark.sql("select count(*) from table_restart_delete").collect().apply(0).get(0)) } test("test drop after restart") { diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseWholeStageTransformerSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseWholeStageTransformerSuite.scala index a891d6d1027b9..9412326ae342c 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseWholeStageTransformerSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseWholeStageTransformerSuite.scala @@ -55,6 +55,8 @@ class GlutenClickHouseWholeStageTransformerSuite extends WholeStageTransformerSu val S3_ACCESS_KEY = "BypTYzcXOlfr03FFIvt4" val S3_SECRET_KEY = "K9MDaGItPSaphorZM8t4hXf30gHF9dBWi6L2dK5E" + val CH_DEFAULT_STORAGE_DIR = "/data" + def AlmostEqualsIsRel(expected: Double, actual: Double, EPSILON: Double = DBL_EPSILON): Unit = { val diff = Math.abs(expected - actual) val epsilon = EPSILON * Math.max(Math.abs(expected), Math.abs(actual)) @@ -162,7 +164,7 @@ class GlutenClickHouseWholeStageTransformerSuite extends WholeStageTransformerSu override def beforeAll(): Unit = { // is not exist may cause some ut error - assert(new 
File("/data").exists()) + assert(new File(CH_DEFAULT_STORAGE_DIR).exists()) // prepare working paths val basePathDir = new File(basePath) diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseCountDistinctSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseCountDistinctSuite.scala index 1b954df22eac1..5887050d0aaa7 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseCountDistinctSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseCountDistinctSuite.scala @@ -213,4 +213,11 @@ class GlutenClickhouseCountDistinctSuite extends GlutenClickHouseWholeStageTrans } ) } + + test("GLUTEN-5852: Fix mismatch result columns size exception related to 5618") { + val sql = + "select distinct * from (select 2 as r3, count(distinct a, b, c), 2 as r1, 2 as r2 from " + + "values (0, null, 1), (1, 1, 1), (2, 2, 1), (1, 2, 1) ,(2, 2, 2) as data(a,b,c) group by c)" + compareResultsAgainstVanillaSpark(sql, true, { _ => }) + } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseFunctionSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseFunctionSuite.scala index 63e105e12a720..8853dfc77853d 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseFunctionSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseFunctionSuite.scala @@ -179,4 +179,51 @@ class GlutenClickhouseFunctionSuite extends GlutenClickHouseTPCHAbstractSuite { |""".stripMargin )(df => checkFallbackOperators(df, 0)) } + + test("GLUTEN-5981 null value from get_json_object") { + spark.sql("create table json_t1 (a string) using parquet") + spark.sql("insert into json_t1 values ('{\"a\":null}')") + runQueryAndCompare( + """ + |SELECT get_json_object(a, '$.a') is null from json_t1 + |""".stripMargin + )(df => checkFallbackOperators(df, 0)) + spark.sql("drop table json_t1") + } + + test("Fix arrayDistinct(Array(Nullable(Decimal))) core dump") { + val create_sql = + """ + |create table if not exists test( + | dec array + |) using parquet + |""".stripMargin + val fill_sql = + """ + |insert into test values(array(1, 2, null)), (array(null, 2,3, 5)) + |""".stripMargin + val query_sql = + """ + |select array_distinct(dec) from test; + |""".stripMargin + spark.sql(create_sql) + spark.sql(fill_sql) + compareResultsAgainstVanillaSpark(query_sql, true, { _ => }) + spark.sql("drop table test") + } + + test("intersect all") { + spark.sql("create table t1 (a int, b string) using parquet") + spark.sql("insert into t1 values (1, '1'),(2, '2'),(3, '3'),(4, '4'),(5, '5'),(6, '6')") + spark.sql("create table t2 (a int, b string) using parquet") + spark.sql("insert into t2 values (4, '4'),(5, '5'),(6, '6'),(7, '7'),(8, '8'),(9, '9')") + runQueryAndCompare( + """ + |SELECT a,b FROM t1 INTERSECT ALL SELECT a,b FROM t2 + |""".stripMargin + )(df => checkFallbackOperators(df, 0)) + spark.sql("drop table t1") + spark.sql("drop table t2") + } + } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseStringFunctionsSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseStringFunctionsSuite.scala new file mode 100644 index 0000000000000..163a8fedab7ea --- /dev/null +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseStringFunctionsSuite.scala @@ -0,0 +1,140 
@@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.execution + +import org.apache.spark.SparkConf + +class GlutenClickhouseStringFunctionsSuite extends GlutenClickHouseWholeStageTransformerSuite { + + /** Run Gluten + ClickHouse Backend with SortShuffleManager */ + override protected def sparkConf: SparkConf = { + super.sparkConf + .set("spark.shuffle.manager", "sort") + .set("spark.io.compression.codec", "SNAPPY") + .set("spark.sql.shuffle.partitions", "5") + .set("spark.sql.autoBroadcastJoinThreshold", "10MB") + } + + test("GLUTEN-5821: trim_character support value from column.") { + withTable("trim") { + sql("create table trim(a String, b String) using parquet") + sql(""" + |insert into trim values + | ('aba', 'a'),('bba', 'b'),('abcdef', 'abcd'), (null, '123'),('123', null) + |""".stripMargin) + + val sql_str = + s"""select + | trim(both b from a) + | from trim + """.stripMargin + + runQueryAndCompare(sql_str) { _ => } + } + } + + test("GLUTEN-5897: fix regexp_extract with bracket") { + withTable("regexp_extract_bracket") { + sql("create table regexp_extract_bracket(a String) using parquet") + sql(""" + |insert into regexp_extract_bracket + | values ('123.123abc-abc'),('123-LOW'),('123]abc-abc') + |""".stripMargin) + + val sql_str = + s"""select + | regexp_extract(a, '([0-9][[\\.][0-9]]*)', 1) + | , regexp_extract(a, '([0-9][[\\.][0-9]]*)', 1) + | , regexp_extract(a, '([0-9][[]]]*)', 1) + | from regexp_extract_bracket + """.stripMargin + + runQueryAndCompare(sql_str) { _ => } + } + } + + test("replace") { + val tableName = "replace_table" + withTable(tableName) { + sql(s"create table $tableName(src String, idx String, dest String) using parquet") + sql(s""" + |insert into $tableName values + | (null, null, null), + | ('1', '1', null), + | ('1', '1', '2'), + | ('1', null, '2'), + | ('1', '1', '3'), + | (null, '1', '2'), + | ('1', '', '3') + """.stripMargin) + + val sql_str = + s""" + |select + | REPLACE(src, idx, dest), + | REPLACE(src, null, dest), + | REPLACE(null, null, dest), + | REPLACE(null, null, null), + | REPLACE(src, '1', null) + | from $tableName + """.stripMargin + + runQueryAndCompare(sql_str) { _ => } + } + } + + test("base64") { + val tableName = "base64_table" + withTable(tableName) { + sql(s"create table $tableName(data String) using parquet") + sql(s""" + |insert into $tableName values + | ("hello") + """.stripMargin) + + val sql_str = + s""" + |select + | base64(data) + | from $tableName + """.stripMargin + + runQueryAndCompare(sql_str) { _ => } + } + } + + test("unbase64") { + val tableName = "unbase64_table" + withTable(tableName) { + sql(s"create table $tableName(data String) using parquet") + sql(s""" + |insert into $tableName values + | ("U3BhcmsgU1FM") + """.stripMargin) + + val sql_str 
= + s""" + |select + | unbase64(data) + | from $tableName + """.stripMargin + + runQueryAndCompare(sql_str) { _ => } + } + } + +} diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala index 7b52a970ef08d..d3e3e94460369 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala @@ -281,6 +281,12 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS } } + test("Test get_json_object 11") { + runQueryAndCompare( + "SELECT string_field1 from json_test where" + + " get_json_object(string_field1, '$.a') is not null") { _ => } + } + test("Test covar_samp") { runQueryAndCompare("SELECT covar_samp(double_field1, int_field1) from json_test") { _ => } } @@ -708,4 +714,20 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS } + test("array functions with lambda") { + withTable("tb_array") { + sql("create table tb_array(ids array) using parquet") + sql(""" + |insert into tb_array values (array(1,5,2,null, 3)), (array(1,1,3,2)), (null), (array()) + |""".stripMargin) + val transform_sql = "select transform(ids, x -> x + 1) from tb_array" + runQueryAndCompare(transform_sql)(checkGlutenOperatorMatch[ProjectExecTransformer]) + + val filter_sql = "select filter(ids, x -> x % 2 == 1) from tb_array"; + runQueryAndCompare(filter_sql)(checkGlutenOperatorMatch[ProjectExecTransformer]) + + val aggregate_sql = "select ids, aggregate(ids, 3, (acc, x) -> acc + x) from tb_array"; + runQueryAndCompare(aggregate_sql)(checkGlutenOperatorMatch[ProjectExecTransformer]) + } + } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenFunctionSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenFunctionSuite.scala new file mode 100644 index 0000000000000..8a20558c4a8e9 --- /dev/null +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenFunctionSuite.scala @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.execution.compatibility + +import org.apache.gluten.execution.GlutenClickHouseWholeStageTransformerSuite +import org.apache.gluten.test.GlutenSQLTestUtils +import org.apache.gluten.utils.UTSystemParameters + +import org.apache.spark.internal.Logging + +class GlutenFunctionSuite + extends GlutenClickHouseWholeStageTransformerSuite + with GlutenSQLTestUtils + with Logging { + + override protected val fileFormat: String = "parquet" + private val testPath: String = s"${UTSystemParameters.testDataPath}/$fileFormat/function" + + case class TestCase( + name: String, + sql: String, + ignore: Boolean = false + ) + + private val testCase = Seq( + TestCase( + "left", + s"""|select + | left(`99`, 2) + | , left(`100`, 3) + | , left(`101`, 4) + | , left(`101`, 0) + | , left(`101`, -1) -- error + | from parquet.`$testPath/left` + |""".stripMargin + ), + TestCase( + "trim", + s"""|select + | trim(both ' ' from `99`) + | , trim(LEADING `100` from `99`) -- error + | , trim(TRAILING `100` from `99`) -- error + | from parquet.`$testPath/left` + |""".stripMargin + ), + TestCase( + "date_format 1", + s"""|select + | `0` + | , date_format(`0`, 'y') + | , date_format(`0`, 'M') + | , date_format(`0`, 'D') -- error timezone related issue + | , date_format(`0`, 'd') + | , date_format(`0`, 'H') + | , date_format(`0`, 'h') + | , date_format(`0`, 'm') + | , date_format(`0`, 's') + | from parquet.`$testPath/date_format/date` + |""".stripMargin + ), + TestCase( + "date_format 2", + s"""|select + | `4` + | , date_format(`4`, 'y') + | , date_format(`4`, 'M') + | , date_format(`4`, 'D') -- error timezone related issue + | , date_format(`4`, 'd') + | , date_format(`4`, 'H') + | , date_format(`4`, 'h') + | , date_format(`4`, 'm') + | , date_format(`4`, 's') + | from parquet.`$testPath/date_format/timestamp` + |""".stripMargin + ) + ) + + testCase.foreach { + data => + if (data.ignore) { + ignore(s"${data.name}") {} + } else { + test(s"${data.name}") { + compareResultsAgainstVanillaSpark( + data.sql, + compareResult = true, + { _ => } + ) + } + } + } + +} diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/extension/GlutenCustomAggExpressionSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/extension/GlutenCustomAggExpressionSuite.scala index 3a2808e70f225..ba7d2c8f1935d 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/extension/GlutenCustomAggExpressionSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/extension/GlutenCustomAggExpressionSuite.scala @@ -91,7 +91,7 @@ class GlutenCustomAggExpressionSuite extends GlutenClickHouseTPCHAbstractSuite { assert(planExecs(3).isInstanceOf[HashAggregateExec]) val substraitContext = new SubstraitContext - planExecs(2).asInstanceOf[CHHashAggregateExecTransformer].doTransform(substraitContext) + planExecs(2).asInstanceOf[CHHashAggregateExecTransformer].transform(substraitContext) // Check the functions assert(substraitContext.registeredFunction.containsKey("custom_sum_double:req_fp64")) diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/metrics/GlutenClickHouseMetricsUTUtils.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/metrics/GlutenClickHouseMetricsUTUtils.scala index bc395ca88e9a3..ee0ad8039afc1 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/metrics/GlutenClickHouseMetricsUTUtils.scala +++ 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/metrics/GlutenClickHouseMetricsUTUtils.scala @@ -20,7 +20,7 @@ import org.apache.gluten.execution.WholeStageTransformer import org.apache.gluten.memory.alloc.CHNativeMemoryAllocators import org.apache.gluten.metrics.{MetricsUtil, NativeMetrics} import org.apache.gluten.utils.SubstraitPlanPrinterUtil -import org.apache.gluten.vectorized.{CHNativeExpressionEvaluator, GeneralInIterator, GeneralOutIterator} +import org.apache.gluten.vectorized.{CHNativeExpressionEvaluator, GeneralInIterator} import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.expressions.Attribute @@ -47,7 +47,7 @@ object GlutenClickHouseMetricsUTUtils { val transKernel = new CHNativeExpressionEvaluator() val mockMemoryAllocator = CHNativeMemoryAllocators.contextInstanceForUT() - val resIter: GeneralOutIterator = transKernel.createKernelWithBatchIterator( + val resIter = transKernel.createKernelWithBatchIterator( mockMemoryAllocator.getNativeInstanceId, substraitPlan.toByteArray, new Array[Array[Byte]](0), diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/metrics/GlutenClickHouseTPCHMetricsSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/metrics/GlutenClickHouseTPCHMetricsSuite.scala index 1484d46532606..1b3df81667a0b 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/metrics/GlutenClickHouseTPCHMetricsSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/metrics/GlutenClickHouseTPCHMetricsSuite.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.execution.InputIteratorTransformer import scala.collection.JavaConverters._ class GlutenClickHouseTPCHMetricsSuite extends GlutenClickHouseTPCHAbstractSuite { - + private val parquetMaxBlockSize = 4096; override protected val needCopyParquetToTablePath = true override protected val tablesPath: String = basePath + "/tpch-data" @@ -38,6 +38,7 @@ class GlutenClickHouseTPCHMetricsSuite extends GlutenClickHouseTPCHAbstractSuite protected val metricsJsonFilePath: String = rootPath + "metrics-json" protected val substraitPlansDatPath: String = rootPath + "substrait-plans" + // scalastyle:off line.size.limit /** Run Gluten + ClickHouse Backend with SortShuffleManager */ override protected def sparkConf: SparkConf = { super.sparkConf @@ -45,10 +46,15 @@ class GlutenClickHouseTPCHMetricsSuite extends GlutenClickHouseTPCHAbstractSuite .set("spark.io.compression.codec", "LZ4") .set("spark.sql.shuffle.partitions", "1") .set("spark.sql.autoBroadcastJoinThreshold", "10MB") + // .set("spark.gluten.sql.columnar.backend.ch.runtime_config.logger.level", "DEBUG") + .set( + "spark.gluten.sql.columnar.backend.ch.runtime_settings.input_format_parquet_max_block_size", + s"$parquetMaxBlockSize") .set( "spark.gluten.sql.columnar.backend.ch.runtime_config.enable_streaming_aggregating", "true") } + // scalastyle:on line.size.limit override protected def createTPCHNotNullTables(): Unit = { createNotNullTPCHTablesInParquet(tablesPath) @@ -76,6 +82,33 @@ class GlutenClickHouseTPCHMetricsSuite extends GlutenClickHouseTPCHAbstractSuite } } + test("test simple limit query scan metrics") { + val sql = "select * from nation limit 5" + runSql(sql) { + df => + val plans = df.queryExecution.executedPlan.collect { + case scanExec: BasicScanExecTransformer => scanExec + } + assert(plans.size == 1) + assert(plans.head.metrics("numOutputRows").value === 25) + assert(plans.head.metrics("outputVectors").value === 1) + 
assert(plans.head.metrics("outputBytes").value > 0) + } + + val sql2 = "select * from lineitem limit 3" + runSql(sql2) { + df => + val plans = df.queryExecution.executedPlan.collect { + case scanExec: BasicScanExecTransformer => scanExec + } + assert(plans.size == 1) + // 1 block keep in SubstraitFileStep, and 4 blocks keep in other steps + assert(plans.head.metrics("numOutputRows").value === 5 * parquetMaxBlockSize) + assert(plans.head.metrics("outputVectors").value === 1) + assert(plans.head.metrics("outputBytes").value > 0) + } + } + test("test Generate metrics") { val sql = """ diff --git a/backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/parquet/GlutenParquetColumnIndexSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/parquet/GlutenParquetColumnIndexSuite.scala similarity index 70% rename from backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/parquet/GlutenParquetColumnIndexSuite.scala rename to backends-clickhouse/src/test/scala/org/apache/gluten/execution/parquet/GlutenParquetColumnIndexSuite.scala index bc23728524d94..0311594a18ade 100644 --- a/backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/parquet/GlutenParquetColumnIndexSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/parquet/GlutenParquetColumnIndexSuite.scala @@ -14,18 +14,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.gluten.parquet +package org.apache.gluten.execution.parquet import org.apache.gluten.execution.{FileSourceScanExecTransformer, GlutenClickHouseWholeStageTransformerSuite} +import org.apache.gluten.test.GlutenSQLTestUtils import org.apache.gluten.utils.UTSystemParameters import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.gluten.test.GlutenSQLTestUtils import org.apache.spark.sql.internal.SQLConf -case class ParquetData(parquetDir: String, filter: String, scanOutput: Long) +case class ParquetData( + column: String, + parquetDir: String, + filter: String, + scanOutput: Long, + title: Option[String] = None) class GlutenParquetColumnIndexSuite extends GlutenClickHouseWholeStageTransformerSuite @@ -39,20 +44,41 @@ class GlutenParquetColumnIndexSuite // both gluten and vanilla spark dataframe private val parquetData = Seq( ParquetData( + "count(*)", "index/tpch/20003", "`27` <> '1-URGENT' and `9` >= '1995-01-01' and `9` < '1996-01-01' ", 140000), ParquetData( + "count(*)", "index/tpch/upper_case", "c_comment = '! requests wake. (...)ructions. 
furiousl'", - 12853) + 12853), + ParquetData( + "*", + "index/pageindex/query102", + "`198` = 'Crafts' or `198` = 'Computers' or `198`= 'a' or `198`= ''", + 45), + ParquetData( + "count(*)", + "index/pageindex/query102", + "`100001` < 30000 and `100001` > 1000.004", + 45, + Some("push down Decimal filter")), + ParquetData( + "count(*)", + "index/pageindex/query102", + "`100001` in (30000, 1000.004, 45000, 2323445, 4235423.6, 4546677.245, 56677.5)", + 45, + Some("push down Decimal filter In") + ), + ParquetData("count(*)", "index/pageindex/query05", "`142` = true", 9896) ) parquetData.foreach { data => - test(s"${data.parquetDir}") { + test(data.title.getOrElse(data.parquetDir)) { val parquetDir = s"$testPath/${data.parquetDir}" - val sql1 = s"""|select count(*) from $fileFormat.`$parquetDir` + val sql1 = s"""|select ${data.column} from $fileFormat.`$parquetDir` |where ${data.filter} |""".stripMargin compareResultsAgainstVanillaSpark( @@ -72,6 +98,6 @@ class GlutenParquetColumnIndexSuite } override protected def sparkConf: SparkConf = super.sparkConf - .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED, false) + .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") .set("spark.gluten.sql.columnar.backend.ch.runtime_config.use_local_format", "true") } diff --git a/backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/parquet/GlutenParquetFilterSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/parquet/GlutenParquetFilterSuite.scala similarity index 98% rename from backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/parquet/GlutenParquetFilterSuite.scala rename to backends-clickhouse/src/test/scala/org/apache/gluten/execution/parquet/GlutenParquetFilterSuite.scala index 1022b17e4311e..a1b5801daddf2 100644 --- a/backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/parquet/GlutenParquetFilterSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/parquet/GlutenParquetFilterSuite.scala @@ -14,15 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.sql.gluten.parquet +package org.apache.gluten.execution.parquet import org.apache.gluten.execution.{FileSourceScanExecTransformer, GlutenClickHouseWholeStageTransformerSuite} +import org.apache.gluten.test.GlutenSQLTestUtils import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.gluten.test.{GlutenSQLTestUtils, GlutenTPCHBase} +import org.apache.spark.sql.gluten.test.GlutenTPCHBase import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.Decimal @@ -45,8 +46,8 @@ class GlutenParquetFilterSuite override protected def sparkConf: SparkConf = super.sparkConf - .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED, false) - .set(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, -1L) // disable broadcast + .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") + .set(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key, "-1") // disable broadcast private val result: Array[Map[String, Seq[Predicate]]] = Array( Map( // q1 diff --git a/backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/test/GlutenSQLTestUtils.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/test/GlutenSQLTestUtils.scala similarity index 96% rename from backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/test/GlutenSQLTestUtils.scala rename to backends-clickhouse/src/test/scala/org/apache/gluten/test/GlutenSQLTestUtils.scala index 09ffb91ffb4ef..9888baf9aac1b 100644 --- a/backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/test/GlutenSQLTestUtils.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/test/GlutenSQLTestUtils.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.gluten.test +package org.apache.gluten.test import org.apache.gluten.GlutenConfig diff --git a/backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/test/GlutenTPCBase.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/test/GlutenTPCBase.scala similarity index 80% rename from backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/test/GlutenTPCBase.scala rename to backends-clickhouse/src/test/scala/org/apache/gluten/test/GlutenTPCBase.scala index 21c6abb56fbe8..224ad64437366 100644 --- a/backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/test/GlutenTPCBase.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/test/GlutenTPCBase.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.sql.gluten.test +package org.apache.gluten.test import org.apache.spark.SparkConf import org.apache.spark.sql.internal.SQLConf @@ -27,12 +27,12 @@ trait GlutenTPCBase extends SharedSparkSession { override protected def sparkConf: SparkConf = { if (injectStats) { super.sparkConf - .set(SQLConf.MAX_TO_STRING_FIELDS, Int.MaxValue) - .set(SQLConf.CBO_ENABLED, true) - .set(SQLConf.PLAN_STATS_ENABLED, true) - .set(SQLConf.JOIN_REORDER_ENABLED, true) + .set(SQLConf.MAX_TO_STRING_FIELDS.key, s"${Int.MaxValue}") + .set(SQLConf.CBO_ENABLED.key, "true") + .set(SQLConf.PLAN_STATS_ENABLED.key, "true") + .set(SQLConf.JOIN_REORDER_ENABLED.key, "true") } else { - super.sparkConf.set(SQLConf.MAX_TO_STRING_FIELDS, Int.MaxValue) + super.sparkConf.set(SQLConf.MAX_TO_STRING_FIELDS.key, s"${Int.MaxValue}") } } diff --git a/backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/test/GlutenTPCHBase.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/test/GlutenTPCHBase.scala similarity index 98% rename from backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/test/GlutenTPCHBase.scala rename to backends-clickhouse/src/test/scala/org/apache/gluten/test/GlutenTPCHBase.scala index 4ed7e45131ce4..685f185ac81ff 100644 --- a/backends-clickhouse/src/test/scala/org/apache/spark/sql/gluten/test/GlutenTPCHBase.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/test/GlutenTPCHBase.scala @@ -16,6 +16,8 @@ */ package org.apache.spark.sql.gluten.test +import org.apache.gluten.test.GlutenTPCBase + import org.apache.spark.sql.catalyst.TableIdentifier trait GlutenTPCHBase extends GlutenTPCBase { diff --git a/backends-clickhouse/src/test/scala/org/apache/spark/gluten/NativeWriteChecker.scala b/backends-clickhouse/src/test/scala/org/apache/spark/gluten/NativeWriteChecker.scala new file mode 100644 index 0000000000000..79616d52d0bc4 --- /dev/null +++ b/backends-clickhouse/src/test/scala/org/apache/spark/gluten/NativeWriteChecker.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.gluten + +import org.apache.gluten.execution.GlutenClickHouseWholeStageTransformerSuite + +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.execution.datasources.FakeRowAdaptor +import org.apache.spark.sql.util.QueryExecutionListener + +trait NativeWriteChecker extends GlutenClickHouseWholeStageTransformerSuite { + + def checkNativeWrite(sqlStr: String, checkNative: Boolean): Unit = { + var nativeUsed = false + + val queryListener = new QueryExecutionListener { + override def onFailure(f: String, qe: QueryExecution, e: Exception): Unit = {} + override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { + if (!nativeUsed) { + nativeUsed = if (isSparkVersionGE("3.4")) { + false + } else { + qe.executedPlan.find(_.isInstanceOf[FakeRowAdaptor]).isDefined + } + } + } + } + + try { + spark.listenerManager.register(queryListener) + spark.sql(sqlStr) + spark.sparkContext.listenerBus.waitUntilEmpty() + assertResult(checkNative)(nativeUsed) + } finally { + spark.listenerManager.unregister(queryListener) + } + } +} diff --git a/backends-clickhouse/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CustomSum.scala b/backends-clickhouse/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CustomSum.scala index c2d7cf622b3d5..be361277fb1ea 100644 --- a/backends-clickhouse/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CustomSum.scala +++ b/backends-clickhouse/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CustomSum.scala @@ -21,7 +21,6 @@ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.trees.TreePattern.{SUM, TreePattern} import org.apache.spark.sql.catalyst.trees.UnaryLike -import org.apache.spark.sql.catalyst.util.TypeUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -43,8 +42,7 @@ case class CustomSum(child: Expression, failOnError: Boolean = SQLConf.get.ansiE override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(NumericType, YearMonthIntervalType, DayTimeIntervalType)) - override def checkInputDataTypes(): TypeCheckResult = - TypeUtils.checkForAnsiIntervalOrNumericType(child.dataType, "sum") + override def checkInputDataTypes(): TypeCheckResult = TypeCheckResult.TypeCheckSuccess final override val nodePatterns: Seq[TreePattern] = Seq(SUM) @@ -141,11 +139,7 @@ case class CustomSum(child: Expression, failOnError: Boolean = SQLConf.get.ansiE * overflow has happened. So now, if ansi is enabled, then throw exception, if not then return * null. If sum is not null, then return the sum. 
*/ - override lazy val evaluateExpression: Expression = resultType match { - case d: DecimalType => - If(isEmpty, Literal.create(null, resultType), CheckOverflowInSum(sum, d, !failOnError)) - case _ => sum - } + override lazy val evaluateExpression: Expression = sum override protected def withNewChildInternal(newChild: Expression): CustomSum = copy(child = newChild) diff --git a/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHHashBuildBenchmark.scala b/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHHashBuildBenchmark.scala index 487433c469c16..8d4bee5546253 100644 --- a/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHHashBuildBenchmark.scala +++ b/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHHashBuildBenchmark.scala @@ -104,7 +104,7 @@ object CHHashBuildBenchmark extends SqlBasedBenchmark with CHSqlBasedBenchmark w ( countsAndBytes.flatMap(_._2), countsAndBytes.map(_._1).sum, - BroadCastHashJoinContext(Seq(child.output.head), Inner, child.output, "") + BroadCastHashJoinContext(Seq(child.output.head), Inner, false, child.output, "") ) } } diff --git a/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHParquetReadBenchmark.scala b/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHParquetReadBenchmark.scala deleted file mode 100644 index 5e802eeed1e4f..0000000000000 --- a/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHParquetReadBenchmark.scala +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.sql.execution.benchmarks - -import org.apache.gluten.GlutenConfig -import org.apache.gluten.backendsapi.BackendsApiManager -import org.apache.gluten.execution.{FileSourceScanExecTransformer, WholeStageTransformContext} -import org.apache.gluten.expression.ConverterUtils -import org.apache.gluten.sql.shims.SparkShimLoader -import org.apache.gluten.substrait.SubstraitContext -import org.apache.gluten.substrait.plan.PlanBuilder -import org.apache.gluten.vectorized.{CHBlockConverterJniWrapper, CHNativeBlock} - -import org.apache.spark.benchmark.Benchmark -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow} -import org.apache.spark.sql.execution.FileSourceScanExec -import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark -import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile} -import org.apache.spark.sql.vectorized.ColumnarBatch - -import com.google.common.collect.Lists - -import scala.collection.JavaConverters._ - -/** - * Benchmark to measure Clickhouse parquet read performance. To run this benchmark: - * {{{ - * 1. Run in IDEA: run this class directly; - * 2. Run without IDEA: bin/spark-submit --class - * --jars ,, - * --conf xxxx=xxx - * backends-clickhouse-XXX-tests.jar - * parameters - * - * Parameters: - * 1. parquet files dir; - * 2. the count of the parquet file to read; - * 3. the fields to read; - * 4. the execution count; - * 5. whether to run vanilla spark benchmarks; - * }}} - */ -object CHParquetReadBenchmark extends SqlBasedBenchmark with CHSqlBasedBenchmark { - - protected lazy val appName = "CHParquetReadBenchmark" - protected lazy val thrdNum = "1" - protected lazy val memorySize = "4G" - protected lazy val offheapSize = "4G" - - def beforeAll(): Unit = {} - - override def getSparkSession: SparkSession = { - beforeAll() - val conf = getSparkConf - .setIfMissing("spark.sql.columnVector.offheap.enabled", "true") - .set("spark.gluten.sql.columnar.separate.scan.rdd.for.ch", "true") - - SparkSession.builder.config(conf).getOrCreate() - } - - override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { - val (parquetDir, readFileCnt, scanSchema, executedCnt, executedVanilla) = - if (mainArgs.isEmpty) { - ("/data/tpch-data/parquet/lineitem", 3, "l_orderkey,l_receiptdate", 5, true) - } else { - (mainArgs(0), mainArgs(1).toInt, mainArgs(2), mainArgs(3).toInt, mainArgs(4).toBoolean) - } - - val chParquet = spark.sql(s""" - |select $scanSchema from parquet.`$parquetDir` - | - |""".stripMargin) - - // Get the `FileSourceScanExecTransformer` - val chScanPlan = chParquet.queryExecution.executedPlan.collect { - case scan: FileSourceScanExecTransformer => scan - } - - val chFileScan = chScanPlan.head - val outputAttrs = chFileScan.outputAttributes() - val filePartitions = chFileScan.getPartitions - .take(readFileCnt) - .map(_.asInstanceOf[FilePartition]) - - val numOutputRows = chFileScan.longMetric("numOutputRows") - val numOutputVectors = chFileScan.longMetric("outputVectors") - val scanTime = chFileScan.longMetric("scanTime") - // Generate Substrait plan - val substraitContext = new SubstraitContext - val transformContext = chFileScan.doTransform(substraitContext) - val outNames = new java.util.ArrayList[String]() - for (attr <- outputAttrs) { - outNames.add(ConverterUtils.genColumnNameWithExprId(attr)) - } - val planNode = - PlanBuilder.makePlan(substraitContext, 
Lists.newArrayList(transformContext.root), outNames) - - val nativeFileScanRDD = BackendsApiManager.getIteratorApiInstance.genNativeFileScanRDD( - spark.sparkContext, - WholeStageTransformContext(planNode, substraitContext), - chFileScan.getSplitInfos, - chFileScan, - numOutputRows, - numOutputVectors, - scanTime - ) - - // Get the total row count - val chRowCnt = nativeFileScanRDD - .mapPartitionsInternal(batches => batches.map(batch => batch.numRows().toLong)) - .collect() - .sum - - val parquetReadBenchmark = - new Benchmark( - s"Parquet Read $readFileCnt files, fields: $scanSchema, total $chRowCnt records", - chRowCnt, - output = output) - - parquetReadBenchmark.addCase(s"ClickHouse Parquet Read", executedCnt) { - _ => - val resultRDD: RDD[Long] = nativeFileScanRDD.mapPartitionsInternal { - batches => - batches.map { - batch => - val block = CHNativeBlock.fromColumnarBatch(batch) - block.totalBytes() - block.close() - batch.numRows().toLong - } - } - resultRDD.collect() - } - - parquetReadBenchmark.addCase(s"ClickHouse Parquet Read to Rows", executedCnt) { - _ => - val resultRDD: RDD[Long] = nativeFileScanRDD.mapPartitionsInternal { - batches => - batches.map { - batch => - val block = CHNativeBlock.fromColumnarBatch(batch) - val info = - CHBlockConverterJniWrapper.convertColumnarToRow(block.blockAddress(), null) - new Iterator[InternalRow] { - var rowId = 0 - val row = new UnsafeRow(batch.numCols()) - var closed = false - - override def hasNext: Boolean = { - val result = rowId < batch.numRows() - if (!result && !closed) { - CHBlockConverterJniWrapper.freeMemory(info.memoryAddress, info.totalSize) - closed = true - } - result - } - - override def next: UnsafeRow = { - if (rowId >= batch.numRows()) throw new NoSuchElementException - - val (offset, length) = (info.offsets(rowId), info.lengths(rowId)) - row.pointTo(null, info.memoryAddress + offset, length.toInt) - rowId += 1 - row - } - }.foreach(_.numFields) - block.close() - - batch.numRows().toLong - } - } - resultRDD.collect() - } - - if (executedVanilla) { - spark.conf.set(GlutenConfig.GLUTEN_ENABLED.key, "false") - - val vanillaParquet = spark.sql(s""" - |select $scanSchema from parquet.`$parquetDir` - | - |""".stripMargin) - - val vanillaScanPlan = vanillaParquet.queryExecution.executedPlan.collect { - case scan: FileSourceScanExec => scan - } - - val fileScan = vanillaScanPlan.head - val fileScanOutput = fileScan.output - val relation = fileScan.relation - val readFile: PartitionedFile => Iterator[InternalRow] = - relation.fileFormat.buildReaderWithPartitionValues( - sparkSession = relation.sparkSession, - dataSchema = relation.dataSchema, - partitionSchema = relation.partitionSchema, - requiredSchema = fileScan.requiredSchema, - filters = Seq.empty, - options = relation.options, - hadoopConf = relation.sparkSession.sessionState.newHadoopConfWithOptions(relation.options) - ) - - val newFileScanRDD = - SparkShimLoader.getSparkShims - .generateFileScanRDD(spark, readFile, filePartitions, fileScan) - .asInstanceOf[RDD[ColumnarBatch]] - - val rowCnt = newFileScanRDD - .mapPartitionsInternal(batches => batches.map(batch => batch.numRows().toLong)) - .collect() - .sum - assert(chRowCnt == rowCnt, "The row count of the benchmark is not equal.") - - parquetReadBenchmark.addCase(s"Vanilla Spark Parquet Read", executedCnt) { - _ => - val resultRDD: RDD[Long] = newFileScanRDD.mapPartitionsInternal { - batches => batches.map(_.numRows().toLong) - } - resultRDD.collect() - } - - parquetReadBenchmark.addCase(s"Vanilla Spark Parquet Read to 
Rows", executedCnt) { - _ => - val resultRDD: RDD[Long] = newFileScanRDD.mapPartitionsInternal { - batches => - val toUnsafe = UnsafeProjection.create(fileScanOutput, fileScanOutput) - batches.map { - batch => - // Convert to row and decode parquet value - batch.rowIterator().asScala.map(toUnsafe).foreach(_.numFields) - batch.numRows().toLong - } - } - resultRDD.collect() - } - } - - parquetReadBenchmark.run() - } -} diff --git a/backends-velox/pom.xml b/backends-velox/pom.xml index 70b8b901baac0..de529a34df2fc 100755 --- a/backends-velox/pom.xml +++ b/backends-velox/pom.xml @@ -87,7 +87,7 @@ <groupId>org.scalacheck</groupId> <artifactId>scalacheck_${scala.binary.version}</artifactId> - <version>1.13.5</version> + <version>1.17.0</version> <scope>test</scope> @@ -113,13 +113,13 @@ <groupId>org.scalatestplus</groupId> - <artifactId>scalatestplus-mockito_2.12</artifactId> + <artifactId>scalatestplus-mockito_${scala.binary.version}</artifactId> <version>1.0.0-M2</version> <scope>test</scope> <groupId>org.scalatestplus</groupId> - <artifactId>scalatestplus-scalacheck_2.12</artifactId> + <artifactId>scalatestplus-scalacheck_${scala.binary.version}</artifactId> <version>3.1.0.0-RC2</version> <scope>test</scope> diff --git a/backends-velox/src/main/java/org/apache/gluten/udf/UdfJniWrapper.java b/backends-velox/src/main/java/org/apache/gluten/udf/UdfJniWrapper.java index 4b609769b2abc..8bfe8bad5c018 100644 --- a/backends-velox/src/main/java/org/apache/gluten/udf/UdfJniWrapper.java +++ b/backends-velox/src/main/java/org/apache/gluten/udf/UdfJniWrapper.java @@ -18,7 +18,5 @@ public class UdfJniWrapper { - public UdfJniWrapper() {} - - public native void getFunctionSignatures(); + public static native void getFunctionSignatures(); } diff --git a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchAppender.java b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchAppender.java new file mode 100644 index 0000000000000..32b2289471f93 --- /dev/null +++ b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchAppender.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.gluten.utils; + +import org.apache.gluten.exec.Runtime; +import org.apache.gluten.exec.Runtimes; +import org.apache.gluten.vectorized.ColumnarBatchInIterator; +import org.apache.gluten.vectorized.ColumnarBatchOutIterator; + +import org.apache.spark.sql.vectorized.ColumnarBatch; + +import java.util.Iterator; + +public final class VeloxBatchAppender { + public static ColumnarBatchOutIterator create( + int minOutputBatchSize, Iterator<ColumnarBatch> in) { + final Runtime runtime = Runtimes.contextInstance("VeloxBatchAppender"); + long outHandle = + VeloxBatchAppenderJniWrapper.create(runtime) + .create(minOutputBatchSize, new ColumnarBatchInIterator(in)); + return new ColumnarBatchOutIterator(runtime, outHandle); + } +} diff --git a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchAppenderJniWrapper.java b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchAppenderJniWrapper.java new file mode 100644 index 0000000000000..231e655539813 --- /dev/null +++ b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchAppenderJniWrapper.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.gluten.utils; + +import org.apache.gluten.exec.Runtime; +import org.apache.gluten.exec.RuntimeAware; +import org.apache.gluten.vectorized.ColumnarBatchInIterator; + +public class VeloxBatchAppenderJniWrapper implements RuntimeAware { + private final Runtime runtime; + + private VeloxBatchAppenderJniWrapper(Runtime runtime) { + this.runtime = runtime; + } + + public static VeloxBatchAppenderJniWrapper create(Runtime runtime) { + return new VeloxBatchAppenderJniWrapper(runtime); + } + + @Override + public long handle() { + return runtime.getHandle(); + } + + public native long create(int minOutputBatchSize, ColumnarBatchInIterator itr); +} diff --git a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilter.java b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilter.java index 13ba8e0113b1b..f23426d7da9dd 100644 --- a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilter.java +++ b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilter.java @@ -16,6 +16,8 @@ */ package org.apache.gluten.utils; +import org.apache.gluten.exec.Runtimes; + import org.apache.commons.io.IOUtils; import org.apache.spark.util.sketch.BloomFilter; import org.apache.spark.util.sketch.IncompatibleMergeException; @@ -27,17 +29,15 @@ import java.io.OutputStream; public class VeloxBloomFilter extends BloomFilter { - - private final VeloxBloomFilterJniWrapper jni; + private final VeloxBloomFilterJniWrapper jni = + VeloxBloomFilterJniWrapper.create(Runtimes.contextInstance("VeloxBloomFilter")); private final long handle; private VeloxBloomFilter(byte[] data) { - jni = VeloxBloomFilterJniWrapper.create(); handle = jni.init(data); } private VeloxBloomFilter(int capacity) { - jni = VeloxBloomFilterJniWrapper.create(); handle = jni.empty(capacity); } diff --git a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilterJniWrapper.java b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilterJniWrapper.java index 94f8e17bc6e5d..3ddfd2c02ed8c 100644 --- a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilterJniWrapper.java +++ b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilterJniWrapper.java @@ -18,7 +18,6 @@ import org.apache.gluten.exec.Runtime; import org.apache.gluten.exec.RuntimeAware; -import org.apache.gluten.exec.Runtimes; public class VeloxBloomFilterJniWrapper implements RuntimeAware { private final Runtime runtime; @@ -27,8 +26,8 @@ private VeloxBloomFilterJniWrapper(Runtime runtime) { this.runtime = runtime; } - public static VeloxBloomFilterJniWrapper create() { - return new VeloxBloomFilterJniWrapper(Runtimes.contextInstance()); + public static VeloxBloomFilterJniWrapper create(Runtime runtime) { + return new VeloxBloomFilterJniWrapper(runtime); } @Override diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala index a2da0b8b2a864..0238508d96995 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala @@ -29,13 +29,14 @@ import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat.{DwrfReadFo import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions.{Alias, CumeDist, DenseRank, Descending, Expression, Lag, Lead, Literal, MakeYMInterval, NamedExpression, 
NthValue, NTile, PercentRank, Rand, RangeFrame, Rank, RowNumber, SortOrder, SparkPartitionID, SpecialFrameBoundary, SpecifiedWindowFrame, Uuid} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, ApproximatePercentile, Count, Sum} -import org.apache.spark.sql.catalyst.plans.JoinType +import org.apache.spark.sql.catalyst.plans.{JoinType, LeftOuter, RightOuter} import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.execution.{ProjectExec, SparkPlan} import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand import org.apache.spark.sql.execution.datasources.{FileFormat, InsertIntoHadoopFsRelationCommand} import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.hive.execution.HiveFileFormat import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -51,7 +52,6 @@ class VeloxBackend extends Backend { override def validatorApi(): ValidatorApi = new VeloxValidatorApi override def metricsApi(): MetricsApi = new VeloxMetricsApi override def listenerApi(): ListenerApi = new VeloxListenerApi - override def broadcastApi(): BroadcastApi = new VeloxBroadcastApi override def settings(): BackendSettingsApi = VeloxBackendSettings } @@ -65,6 +65,7 @@ object VeloxBackendSettings extends BackendSettingsApi { val GLUTEN_VELOX_UDF_LIB_PATHS = getBackendConfigPrefix() + ".udfLibraryPaths" val GLUTEN_VELOX_DRIVER_UDF_LIB_PATHS = getBackendConfigPrefix() + ".driver.udfLibraryPaths" + val GLUTEN_VELOX_INTERNAL_UDF_LIB_PATHS = getBackendConfigPrefix() + ".internal.udfLibraryPaths" val MAXIMUM_BATCH_SIZE: Int = 32768 @@ -183,6 +184,30 @@ object VeloxBackendSettings extends BackendSettingsApi { bucketSpec: Option[BucketSpec], options: Map[String, String]): ValidationResult = { + // Validate if HiveFileFormat write is supported based on output file type + def validateHiveFileFormat(hiveFileFormat: HiveFileFormat): Option[String] = { + // Reflect to get access to fileSinkConf which contains the output file format + val fileSinkConfField = format.getClass.getDeclaredField("fileSinkConf") + fileSinkConfField.setAccessible(true) + val fileSinkConf = fileSinkConfField.get(hiveFileFormat) + val tableInfoField = fileSinkConf.getClass.getDeclaredField("tableInfo") + tableInfoField.setAccessible(true) + val tableInfo = tableInfoField.get(fileSinkConf) + val getOutputFileFormatClassNameMethod = tableInfo.getClass + .getDeclaredMethod("getOutputFileFormatClassName") + val outputFileFormatClassName = getOutputFileFormatClassNameMethod.invoke(tableInfo) + + // Match based on the output file format class name + outputFileFormatClassName match { + case "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat" => + None + case _ => + Some( + "HiveFileFormat is supported only with Parquet as the output file type" + ) // Unsupported format + } + } + def validateCompressionCodec(): Option[String] = { // Velox doesn't support brotli and lzo. val unSupportedCompressions = Set("brotli", "lzo", "lz4raw", "lz4_raw") @@ -195,7 +220,7 @@ object VeloxBackendSettings extends BackendSettingsApi { } // Validate if all types are supported. 
- def validateDateTypes(): Option[String] = { + def validateDataTypes(): Option[String] = { val unsupportedTypes = fields.flatMap { field => field.dataType match { @@ -223,8 +248,13 @@ object VeloxBackendSettings extends BackendSettingsApi { def validateFileFormat(): Option[String] = { format match { - case _: ParquetFileFormat => None - case _: FileFormat => Some("Only parquet fileformat is supported in Velox backend.") + case _: ParquetFileFormat => None // Parquet is directly supported + case h: HiveFileFormat if GlutenConfig.getConf.enableHiveFileFormatWriter => + validateHiveFileFormat(h) // Parquet via Hive SerDe + case _ => + Some( + "Only ParquetFileFormat and HiveFileFormat are supported." + ) // Unsupported format } } @@ -251,7 +281,7 @@ object VeloxBackendSettings extends BackendSettingsApi { validateCompressionCodec() .orElse(validateFileFormat()) .orElse(validateFieldMetadata()) - .orElse(validateDateTypes()) + .orElse(validateDataTypes()) .orElse(validateWriteFilesOptions()) .orElse(validateBucketSpec()) match { case Some(reason) => ValidationResult.notOk(reason) @@ -272,6 +302,10 @@ object VeloxBackendSettings extends BackendSettingsApi { override def supportNativeMetadataColumns(): Boolean = true + override def supportNativeRowIndexColumn(): Boolean = true + + override def supportNativeInputFileRelatedExpr(): Boolean = true + override def supportExpandExec(): Boolean = true override def supportSortExec(): Boolean = true @@ -297,15 +331,9 @@ object VeloxBackendSettings extends BackendSettingsApi { case _ => throw new GlutenNotSupportException(s"$func is not supported.") } - // Block the offloading by checking Velox's current limitations - // when literal bound type is used for RangeFrame. def checkLimitations(swf: SpecifiedWindowFrame, orderSpec: Seq[SortOrder]): Unit = { - def doCheck(bound: Expression, isUpperBound: Boolean): Unit = { + def doCheck(bound: Expression): Unit = { bound match { - case e if e.foldable => - throw new GlutenNotSupportException( - "Window frame of type RANGE does" + - " not support constant arguments in velox backend") case _: SpecialFrameBoundary => case e if e.foldable => orderSpec.foreach( @@ -326,17 +354,11 @@ object VeloxBackendSettings extends BackendSettingsApi { "Only integral type & date type are" + " supported for sort key when literal bound type is used!") }) - val rawValue = e.eval().toString.toLong - if (isUpperBound && rawValue < 0) { - throw new GlutenNotSupportException("Negative upper bound is not supported!") - } else if (!isUpperBound && rawValue > 0) { - throw new GlutenNotSupportException("Positive lower bound is not supported!") - } case _ => } } - doCheck(swf.upper, true) - doCheck(swf.lower, false) + doCheck(swf.upper) + doCheck(swf.lower) } windowExpression.windowSpec.frameSpecification match { @@ -362,9 +384,9 @@ object VeloxBackendSettings extends BackendSettingsApi { } override def supportColumnarShuffleExec(): Boolean = { - GlutenConfig.getConf.isUseColumnarShuffleManager || - GlutenConfig.getConf.isUseCelebornShuffleManager || - GlutenConfig.getConf.isUseUniffleShuffleManager + GlutenConfig.getConf.enableColumnarShuffle && (GlutenConfig.getConf.isUseColumnarShuffleManager + || GlutenConfig.getConf.isUseCelebornShuffleManager + || GlutenConfig.getConf.isUseUniffleShuffleManager) } override def enableJoinKeysRewrite(): Boolean = false @@ -376,13 +398,10 @@ object VeloxBackendSettings extends BackendSettingsApi { } else { t match { // OPPRO-266: For Velox backend, build right and left are both supported for - // 
LeftOuter and LeftSemi. - // FIXME Hongze 22/12/06 - // HashJoin.scala in shim was not always loaded by class loader. - // The file should be removed and we temporarily disable the improvement - // introduced by OPPRO-266 by commenting out the following prerequisite - // condition. -// case LeftOuter | LeftSemi => true + // LeftOuter. + // TODO: Support LeftSemi after resolve issue + // https://github.com/facebookincubator/velox/issues/9980 + case LeftOuter => true case _ => false } } @@ -394,12 +413,7 @@ object VeloxBackendSettings extends BackendSettingsApi { } else { t match { // OPPRO-266: For Velox backend, build right and left are both supported for RightOuter. - // FIXME Hongze 22/12/06 - // HashJoin.scala in shim was not always loaded by class loader. - // The file should be removed and we temporarily disable the improvement - // introduced by OPPRO-266 by commenting out the following prerequisite - // condition. -// case RightOuter => true + case RightOuter => true case _ => false } } @@ -499,7 +513,11 @@ object VeloxBackendSettings extends BackendSettingsApi { override def supportBroadcastNestedLoopJoinExec(): Boolean = true + override def supportSampleExec(): Boolean = true + override def supportColumnarArrowUdf(): Boolean = true override def generateHdfsConfForLibhdfs(): Boolean = true + + override def needPreComputeRangeFrameBoundary(): Boolean = true } diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxIteratorApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxIteratorApi.scala index f1fbf3648bb26..613e539456ecf 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxIteratorApi.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxIteratorApi.scala @@ -25,18 +25,18 @@ import org.apache.gluten.substrait.plan.PlanNode import org.apache.gluten.substrait.rel.{LocalFilesBuilder, LocalFilesNode, SplitInfo} import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat import org.apache.gluten.utils._ +import org.apache.gluten.utils.iterator.Iterators import org.apache.gluten.vectorized._ -import org.apache.spark.{SparkConf, SparkContext, TaskContext} +import org.apache.spark.{SparkConf, TaskContext} import org.apache.spark.internal.Logging -import org.apache.spark.rdd.RDD import org.apache.spark.softaffinity.SoftAffinity import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils import org.apache.spark.sql.catalyst.util.{DateFormatter, TimestampFormatter} import org.apache.spark.sql.connector.read.InputPartition import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile} import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.types.{BinaryType, DateType, Decimal, DecimalType, StructType, TimestampType} +import org.apache.spark.sql.types._ import org.apache.spark.sql.utils.OASPackageBridge.InputMetricsWrapper import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.ExecutorManager @@ -57,7 +57,14 @@ class VeloxIteratorApi extends IteratorApi with Logging { metadataColumnNames: Seq[String]): SplitInfo = { partition match { case f: FilePartition => - val (paths, starts, lengths, partitionColumns, metadataColumns) = + val ( + paths, + starts, + lengths, + fileSizes, + modificationTimes, + partitionColumns, + metadataColumns) = constructSplitInfo(partitionSchema, f.files, metadataColumnNames) val preferredLocations = SoftAffinity.getFilePartitionLocations(f) @@ -66,6 +73,8 @@ 
class VeloxIteratorApi extends IteratorApi with Logging { paths, starts, lengths, + fileSizes, + modificationTimes, partitionColumns, metadataColumns, fileFormat, @@ -101,8 +110,10 @@ class VeloxIteratorApi extends IteratorApi with Logging { val paths = new JArrayList[String]() val starts = new JArrayList[JLong] val lengths = new JArrayList[JLong]() + val fileSizes = new JArrayList[JLong]() + val modificationTimes = new JArrayList[JLong]() val partitionColumns = new JArrayList[JMap[String, String]] - var metadataColumns = new JArrayList[JMap[String, String]] + val metadataColumns = new JArrayList[JMap[String, String]] files.foreach { file => // The "file.filePath" in PartitionedFile is not the original encoded path, so the decoded @@ -112,6 +123,14 @@ class VeloxIteratorApi extends IteratorApi with Logging { .decode(file.filePath.toString, StandardCharsets.UTF_8.name())) starts.add(JLong.valueOf(file.start)) lengths.add(JLong.valueOf(file.length)) + val (fileSize, modificationTime) = + SparkShimLoader.getSparkShims.getFileSizeAndModificationTime(file) + (fileSize, modificationTime) match { + case (Some(size), Some(time)) => + fileSizes.add(JLong.valueOf(size)) + modificationTimes.add(JLong.valueOf(time)) + case _ => // Do nothing + } val metadataColumn = SparkShimLoader.getSparkShims.generateMetadataColumns(file, metadataColumnNames) metadataColumns.add(metadataColumn) @@ -139,7 +158,7 @@ class VeloxIteratorApi extends IteratorApi with Logging { } partitionColumns.add(partitionColumn) } - (paths, starts, lengths, partitionColumns, metadataColumns) + (paths, starts, lengths, fileSizes, modificationTimes, partitionColumns, metadataColumns) } override def injectWriteFilesTempPath(path: String): Unit = { @@ -185,7 +204,7 @@ class VeloxIteratorApi extends IteratorApi with Logging { resIter.close() } .recyclePayload(batch => batch.close()) - .addToPipelineTime(pipelineTime) + .collectLifeMillis(millis => pipelineTime += millis) .asInterruptible(context) .create() } @@ -228,20 +247,8 @@ class VeloxIteratorApi extends IteratorApi with Logging { nativeResultIterator.close() } .recyclePayload(batch => batch.close()) - .addToPipelineTime(pipelineTime) + .collectLifeMillis(millis => pipelineTime += millis) .create() } // scalastyle:on argcount - - /** Generate Native FileScanRDD, currently only for ClickHouse Backend. 
*/ - override def genNativeFileScanRDD( - sparkContext: SparkContext, - wsCxt: WholeStageTransformContext, - splitInfos: Seq[SplitInfo], - scan: BasicScanExecTransformer, - numOutputRows: SQLMetric, - numOutputBatches: SQLMetric, - scanTime: SQLMetric): RDD[ColumnarBatch] = { - throw new UnsupportedOperationException("Cannot support to generate Native FileScanRDD.") - } } diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala index bbeb3a2715fed..e1abbdd7c6b7e 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala @@ -25,7 +25,8 @@ import org.apache.gluten.init.NativeBackendInitializer import org.apache.gluten.utils._ import org.apache.gluten.vectorized.{JniLibLoader, JniWorkspace} -import org.apache.spark.SparkConf +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.api.plugin.PluginContext import org.apache.spark.sql.execution.datasources.velox.{VeloxOrcWriterInjects, VeloxParquetWriterInjects, VeloxRowSplitter} import org.apache.spark.sql.expression.UDFResolver import org.apache.spark.sql.internal.{GlutenConfigUtil, StaticSQLConf} @@ -38,22 +39,21 @@ import scala.sys.process._ class VeloxListenerApi extends ListenerApi { private val ARROW_VERSION = "1500" - override def onDriverStart(conf: SparkConf): Unit = { + override def onDriverStart(sc: SparkContext, pc: PluginContext): Unit = { + val conf = pc.conf() // sql table cache serializer if (conf.getBoolean(GlutenConfig.COLUMNAR_TABLE_CACHE_ENABLED.key, defaultValue = false)) { conf.set( StaticSQLConf.SPARK_CACHE_SERIALIZER.key, "org.apache.spark.sql.execution.ColumnarCachedBatchSerializer") } - UDFResolver.resolveUdfConf(conf, isDriver = true) - initialize(conf) + initialize(conf, isDriver = true) } override def onDriverShutdown(): Unit = shutdown() - override def onExecutorStart(conf: SparkConf): Unit = { - UDFResolver.resolveUdfConf(conf, isDriver = false) - initialize(conf) + override def onExecutorStart(pc: PluginContext): Unit = { + initialize(pc.conf(), isDriver = false) } override def onExecutorShutdown(): Unit = shutdown() @@ -80,6 +80,8 @@ class VeloxListenerApi extends ListenerApi { new SharedLibraryLoaderCentos7 } else if (system.contains("tencentos") && system.contains("3.2")) { new SharedLibraryLoaderCentos8 + } else if (systemName.contains("Red Hat") && systemVersion.startsWith("9")) { + new SharedLibraryLoaderCentos8 } else if (systemName.contains("Red Hat") && systemVersion.startsWith("8")) { new SharedLibraryLoaderCentos8 } else if (systemName.contains("Red Hat") && systemVersion.startsWith("7")) { @@ -92,7 +94,7 @@ class VeloxListenerApi extends ListenerApi { throw new GlutenException( s"Found unsupported OS($systemName, $systemVersion)! 
Currently, Gluten's Velox backend" + " only supports Ubuntu 20.04/22.04, CentOS 7/8, " + - "Alibaba Cloud Linux 2/3 & Anolis 7/8, tencentos 3.2, RedHat 7/8, " + + "Alibaba Cloud Linux 2/3 & Anolis 7/8, tencentos 3.2, RedHat 7/8/9, " + "Debian 11/12.") } } @@ -132,36 +134,18 @@ class VeloxListenerApi extends ListenerApi { ) { loadLibFromJar(loader, conf) } - loader - .newTransaction() - .loadAndCreateLink(s"libarrow.so.$ARROW_VERSION.0.0", s"libarrow.so.$ARROW_VERSION", false) - .loadAndCreateLink( - s"libparquet.so.$ARROW_VERSION.0.0", - s"libparquet.so.$ARROW_VERSION", - false) - .commit() } private def loadLibWithMacOS(loader: JniLibLoader): Unit = { - loader - .newTransaction() - .loadAndCreateLink( - s"libarrow.$ARROW_VERSION.0.0.dylib", - s"libarrow.$ARROW_VERSION.dylib", - false) - .loadAndCreateLink( - s"libparquet.$ARROW_VERSION.0.0.dylib", - s"libparquet.$ARROW_VERSION.dylib", - false) - .commit() + // Placeholder for loading shared libs on MacOS if user needs. } - private def initialize(conf: SparkConf): Unit = { + private def initialize(conf: SparkConf, isDriver: Boolean): Unit = { SparkDirectoryUtil.init(conf) - val debugJni = conf.getBoolean(GlutenConfig.GLUTEN_DEBUG_MODE, defaultValue = false) && - conf.getBoolean(GlutenConfig.GLUTEN_DEBUG_KEEP_JNI_WORKSPACE, defaultValue = false) - if (debugJni) { - JniWorkspace.enableDebug() + UDFResolver.resolveUdfConf(conf, isDriver = isDriver) + if (conf.getBoolean(GlutenConfig.GLUTEN_DEBUG_KEEP_JNI_WORKSPACE, defaultValue = false)) { + val debugDir = conf.get(GlutenConfig.GLUTEN_DEBUG_KEEP_JNI_WORKSPACE_DIR) + JniWorkspace.enableDebug(debugDir) } val loader = JniWorkspace.getDefault.libLoader diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxMetricsApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxMetricsApi.scala index 7be639d4caf5c..c2696de506419 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxMetricsApi.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxMetricsApi.scala @@ -41,7 +41,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { sparkContext: SparkContext): Map[String, SQLMetric] = { Map( "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), - "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime of input iterator"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of input iterator"), "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors") ) @@ -62,7 +62,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "outputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), - "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime of batch scan"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of batch scan"), "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), "scanTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "scan time"), "peakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), @@ -93,10 +93,8 @@ class VeloxMetricsApi extends MetricsApi with Logging { "rawInputBytes" -> SQLMetrics.createSizeMetric(sparkContext, 
"number of raw input bytes"), "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "outputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), - "scanTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime of scan"), - "wallNanos" -> SQLMetrics.createNanoTimingMetric( - sparkContext, - "totaltime of scan and filter"), + "scanTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of scan"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of scan and filter"), "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), "peakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), "numFiles" -> SQLMetrics.createMetric(sparkContext, "number of files read"), @@ -133,10 +131,8 @@ class VeloxMetricsApi extends MetricsApi with Logging { "rawInputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of raw input bytes"), "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "outputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), - "scanTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime of scan"), - "wallNanos" -> SQLMetrics.createNanoTimingMetric( - sparkContext, - "totaltime of scan and filter"), + "scanTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of scan"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of scan and filter"), "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), "peakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), "numFiles" -> SQLMetrics.createMetric(sparkContext, "number of files read"), @@ -171,7 +167,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "outputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), - "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime of filter"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of filter"), "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), "peakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), "numMemoryAllocations" -> SQLMetrics.createMetric( @@ -187,7 +183,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "outputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), - "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime of project"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of project"), "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), "peakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), "numMemoryAllocations" -> SQLMetrics.createMetric( @@ -205,7 +201,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { "aggOutputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "aggOutputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), "aggCpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), - "aggWallNanos" -> 
SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime of aggregation"), + "aggWallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of aggregation"), "aggPeakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), "aggNumMemoryAllocations" -> SQLMetrics.createMetric( sparkContext, @@ -222,13 +218,13 @@ class VeloxMetricsApi extends MetricsApi with Logging { "rowConstruction cpu wall time count"), "rowConstructionWallNanos" -> SQLMetrics.createNanoTimingMetric( sparkContext, - "totaltime of rowConstruction"), + "time of rowConstruction"), "extractionCpuCount" -> SQLMetrics.createMetric( sparkContext, "extraction cpu wall time count"), "extractionWallNanos" -> SQLMetrics.createNanoTimingMetric( sparkContext, - "totaltime of extraction"), + "time of extraction"), "finalOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of final output rows"), "finalOutputVectors" -> SQLMetrics.createMetric( sparkContext, @@ -244,7 +240,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "outputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), - "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime of expand"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of expand"), "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), "peakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), "numMemoryAllocations" -> SQLMetrics.createMetric( @@ -265,10 +261,10 @@ class VeloxMetricsApi extends MetricsApi with Logging { "dataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size"), "numPartitions" -> SQLMetrics.createMetric(sparkContext, "number of partitions"), "bytesSpilled" -> SQLMetrics.createSizeMetric(sparkContext, "shuffle bytes spilled"), - "splitBufferSize" -> SQLMetrics.createSizeMetric(sparkContext, "split buffer size total"), - "splitTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime to split"), - "spillTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime to spill"), - "deserializeTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime deserialize"), + "splitBufferSize" -> SQLMetrics.createSizeMetric(sparkContext, "split buffer size"), + "splitTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to split"), + "spillTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to spill"), + "deserializeTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to deserialize"), "avgReadBatchNumRows" -> SQLMetrics .createAverageMetric(sparkContext, "avg read batch num rows"), "numInputRows" -> SQLMetrics.createMetric(sparkContext, "number of input rows"), @@ -281,8 +277,8 @@ class VeloxMetricsApi extends MetricsApi with Logging { baseMetrics } else { baseMetrics ++ Map( - "compressTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime to compress"), - "decompressTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime decompress") + "compressTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to compress"), + "decompressTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to decompress") ) } } @@ -292,7 +288,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), 
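Each entry in the metric tables above and below pairs a stable key, read by the matching MetricsUpdater, with a human-readable description; this patch only shortens the descriptions from "totaltime of X" to "time of X". A minimal, self-contained sketch of the three Spark SQLMetrics factories involved (the keys and the record helper here are illustrative, not Gluten API):

    import org.apache.spark.SparkContext
    import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}

    object MetricsSketch {
      // The three metric kinds used throughout VeloxMetricsApi.
      def genExampleMetrics(sc: SparkContext): Map[String, SQLMetric] = Map(
        "numOutputRows" -> SQLMetrics.createMetric(sc, "number of output rows"), // plain counter
        "outputBytes" -> SQLMetrics.createSizeMetric(sc, "number of output bytes"), // shown as a size
        "wallNanos" -> SQLMetrics.createNanoTimingMetric(sc, "time of example op") // shown as a duration
      )

      // An updater bumps them after each processed batch.
      def record(metrics: Map[String, SQLMetric], rows: Long, bytes: Long, nanos: Long): Unit = {
        metrics("numOutputRows") += rows
        metrics("outputBytes") += bytes
        metrics("wallNanos") += nanos
      }
    }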
"outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "outputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), - "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime of window"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of window"), "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), "peakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), "numMemoryAllocations" -> SQLMetrics.createMetric( @@ -314,7 +310,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { Map( "numInputRows" -> SQLMetrics.createMetric(sparkContext, "number of input rows"), "numOutputBatches" -> SQLMetrics.createMetric(sparkContext, "number of output batches"), - "convertTime" -> SQLMetrics.createTimingMetric(sparkContext, "totaltime to convert") + "convertTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to convert") ) override def genLimitTransformerMetrics(sparkContext: SparkContext): Map[String, SQLMetric] = @@ -322,7 +318,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "outputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), - "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime of limit"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of limit"), "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), "peakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), "numMemoryAllocations" -> SQLMetrics.createMetric( @@ -347,7 +343,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "outputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), - "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime of sort"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of sort"), "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), "peakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), "numMemoryAllocations" -> SQLMetrics.createMetric( @@ -370,7 +366,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "numOutputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "numOutputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), - "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime of merge join"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of merge join"), "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), "peakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), "numMemoryAllocations" -> SQLMetrics.createMetric( @@ -381,19 +377,19 @@ class VeloxMetricsApi extends MetricsApi with Logging { "stream preProject cpu wall time count"), "streamPreProjectionWallNanos" -> SQLMetrics.createNanoTimingMetric( sparkContext, - "totaltime of stream preProjection"), + "time of stream preProjection"), "bufferPreProjectionCpuCount" -> 
SQLMetrics.createMetric( sparkContext, "buffer preProject cpu wall time count"), "bufferPreProjectionWallNanos" -> SQLMetrics.createNanoTimingMetric( sparkContext, - "totaltime of buffer preProjection"), + "time of buffer preProjection"), "postProjectionCpuCount" -> SQLMetrics.createMetric( sparkContext, "postProject cpu wall time count"), "postProjectionWallNanos" -> SQLMetrics.createNanoTimingMetric( sparkContext, - "totaltime of postProjection") + "time of postProjection") ) override def genSortMergeJoinTransformerMetricsUpdater( @@ -433,9 +429,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { "hashBuildCpuCount" -> SQLMetrics.createMetric( sparkContext, "hash build cpu wall time count"), - "hashBuildWallNanos" -> SQLMetrics.createNanoTimingMetric( - sparkContext, - "totaltime of hash build"), + "hashBuildWallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of hash build"), "hashBuildPeakMemoryBytes" -> SQLMetrics.createSizeMetric( sparkContext, "hash build peak memory bytes"), @@ -469,9 +463,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { "hashProbeCpuCount" -> SQLMetrics.createMetric( sparkContext, "hash probe cpu wall time count"), - "hashProbeWallNanos" -> SQLMetrics.createNanoTimingMetric( - sparkContext, - "totaltime of hash probe"), + "hashProbeWallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of hash probe"), "hashProbePeakMemoryBytes" -> SQLMetrics.createSizeMetric( sparkContext, "hash probe peak memory bytes"), @@ -501,19 +493,19 @@ class VeloxMetricsApi extends MetricsApi with Logging { "stream preProject cpu wall time count"), "streamPreProjectionWallNanos" -> SQLMetrics.createNanoTimingMetric( sparkContext, - "totaltime of stream preProjection"), + "time of stream preProjection"), "buildPreProjectionCpuCount" -> SQLMetrics.createMetric( sparkContext, "preProject cpu wall time count"), "buildPreProjectionWallNanos" -> SQLMetrics.createNanoTimingMetric( sparkContext, - "totaltime to build preProjection"), + "time to build preProjection"), "postProjectionCpuCount" -> SQLMetrics.createMetric( sparkContext, "postProject cpu wall time count"), "postProjectionWallNanos" -> SQLMetrics.createNanoTimingMetric( sparkContext, - "totaltime of postProjection"), + "time of postProjection"), "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "numOutputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "numOutputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes") @@ -528,9 +520,7 @@ class VeloxMetricsApi extends MetricsApi with Logging { "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "outputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), - "wallNanos" -> SQLMetrics.createNanoTimingMetric( - sparkContext, - "total time of NestedLoopJoin"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of NestedLoopJoin"), "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), "peakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), "numMemoryAllocations" -> SQLMetrics.createMetric( @@ -540,4 +530,20 @@ class VeloxMetricsApi extends MetricsApi with Logging { override def genNestedLoopJoinTransformerMetricsUpdater( metrics: Map[String, SQLMetric]): MetricsUpdater = new NestedLoopJoinMetricsUpdater(metrics) + + override def 
genSampleTransformerMetrics(sparkContext: SparkContext): Map[String, SQLMetric] = + Map( + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), + "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), + "outputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of sample"), + "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), + "peakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), + "numMemoryAllocations" -> SQLMetrics.createMetric( + sparkContext, + "number of memory allocations") + ) + + override def genSampleTransformerMetricsUpdater(metrics: Map[String, SQLMetric]): MetricsUpdater = + new SampleMetricsUpdater(metrics) } diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxSparkPlanExecApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxSparkPlanExecApi.scala index cfa135046012f..e13ebd971ef55 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxSparkPlanExecApi.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxSparkPlanExecApi.scala @@ -22,9 +22,10 @@ import org.apache.gluten.datasource.ArrowConvertorRule import org.apache.gluten.exception.GlutenNotSupportException import org.apache.gluten.execution._ import org.apache.gluten.expression._ +import org.apache.gluten.expression.ExpressionNames.{TRANSFORM_KEYS, TRANSFORM_VALUES} import org.apache.gluten.expression.aggregate.{HLLAdapter, VeloxBloomFilterAggregate, VeloxCollectList, VeloxCollectSet} import org.apache.gluten.extension._ -import org.apache.gluten.extension.columnar.TransformHints +import org.apache.gluten.extension.columnar.FallbackTags import org.apache.gluten.extension.columnar.transition.Convention import org.apache.gluten.extension.columnar.transition.ConventionFunc.BatchOverride import org.apache.gluten.sql.shims.SparkShimLoader @@ -109,8 +110,7 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { GenericExpressionTransformer(condFuncName, Seq(left), condExpr), right, left, - newExpr - ) + newExpr) } /** Transform Uuid to Substrait. */ @@ -123,42 +123,50 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { original) } - override def genTryAddTransformer( + override def genTryArithmeticTransformer( substraitExprName: String, left: ExpressionTransformer, right: ExpressionTransformer, - original: TryEval): ExpressionTransformer = { + original: TryEval, + checkArithmeticExprName: String): ExpressionTransformer = { if (SparkShimLoader.getSparkShims.withAnsiEvalMode(original.child)) { - throw new GlutenNotSupportException(s"add with ansi mode is not supported") + throw new GlutenNotSupportException( + s"${original.child.prettyName} with ansi mode is not supported") } original.child.dataType match { case LongType | IntegerType | ShortType | ByteType => - case _ => throw new GlutenNotSupportException(s"try_add is not supported") + case _ => throw new GlutenNotSupportException(s"$substraitExprName is not supported") } // Offload to velox for only IntegralTypes. 
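The genTryArithmeticTransformer and genArithmeticTransformer changes above generalize the former try_add-only path to any checked arithmetic function: try-mode expressions on integral types become try(<checked function>), ANSI mode is rejected, and everything else maps straight to the plain Substrait name. A simplified, self-contained sketch of that dispatch (the EvalMode cases and function-name strings are stand-ins, not Gluten's actual types):

    object ArithmeticDispatchSketch {
      sealed trait EvalMode
      case object Legacy extends EvalMode
      case object Ansi extends EvalMode
      case object Try extends EvalMode

      // TRY wraps the checked function in try(); ANSI is rejected; LEGACY maps straight through.
      def dispatch(mode: EvalMode, plainName: String, checkedName: String): String = mode match {
        case Try => s"try($checkedName)"
        case Ansi => throw new UnsupportedOperationException(s"$plainName with ansi mode is not supported")
        case Legacy => plainName
      }
    }

For example, dispatch(Try, "add", "check_add") yields the try(check_add) shape that replaces the old hard-coded try_add handling.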
GenericExpressionTransformer( substraitExprName, - Seq(GenericExpressionTransformer(ExpressionNames.TRY_ADD, Seq(left, right), original)), + Seq(GenericExpressionTransformer(checkArithmeticExprName, Seq(left, right), original)), original) } - override def genAddTransformer( + /** + * Map arithmetic expr to different functions: substraitExprName or try(checkArithmeticExprName) + * based on EvalMode. + */ + override def genArithmeticTransformer( substraitExprName: String, left: ExpressionTransformer, right: ExpressionTransformer, - original: Add): ExpressionTransformer = { + original: Expression, + checkArithmeticExprName: String): ExpressionTransformer = { if (SparkShimLoader.getSparkShims.withTryEvalMode(original)) { original.dataType match { case LongType | IntegerType | ShortType | ByteType => - case _ => throw new GlutenNotSupportException(s"try_add is not supported") + case _ => + throw new GlutenNotSupportException(s"$substraitExprName with try mode is not supported") } // Offload to velox for only IntegralTypes. GenericExpressionTransformer( ExpressionMappings.expressionsMap(classOf[TryEval]), - Seq(GenericExpressionTransformer(ExpressionNames.TRY_ADD, Seq(left, right), original)), + Seq(GenericExpressionTransformer(checkArithmeticExprName, Seq(left, right), original)), original) } else if (SparkShimLoader.getSparkShims.withAnsiEvalMode(original)) { - throw new GlutenNotSupportException(s"add with ansi mode is not supported") + throw new GlutenNotSupportException(s"$substraitExprName with ansi mode is not supported") } else { GenericExpressionTransformer(substraitExprName, Seq(left, right), original) } @@ -272,6 +280,26 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { GenericExpressionTransformer(substraitExprName, Seq(endDate, startDate), original) } + override def genPreciseTimestampConversionTransformer( + substraitExprName: String, + children: Seq[ExpressionTransformer], + expr: PreciseTimestampConversion): ExpressionTransformer = { + // Expression used internally to convert the TimestampType to Long and back without losing + // precision, i.e. in microseconds. + val (newSubstraitName, newExpr) = expr match { + case _ @PreciseTimestampConversion(_, TimestampType, LongType) => + (ExpressionMappings.expressionsMap(classOf[UnixMicros]), UnixMicros(expr.child)) + case _ @PreciseTimestampConversion(_, LongType, TimestampType) => + ( + ExpressionMappings.expressionsMap(classOf[MicrosToTimestamp]), + MicrosToTimestamp(expr.child)) + case _ => + // TimestampNTZType is not supported here. + throw new GlutenNotSupportException("PreciseTimestampConversion is not supported") + } + GenericExpressionTransformer(newSubstraitName, children, newExpr) + } + /** * Generate FilterExecTransformer. 
* @@ -312,42 +340,83 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { aggregateAttributes: Seq[Attribute]): HashAggregateExecPullOutBaseHelper = HashAggregateExecPullOutHelper(aggregateExpressions, aggregateAttributes) - override def genColumnarShuffleExchange( - shuffle: ShuffleExchangeExec, - newChild: SparkPlan): SparkPlan = { + override def genColumnarShuffleExchange(shuffle: ShuffleExchangeExec): SparkPlan = { + def allowHashOnMap[T](f: => T): T = { + val originalAllowHash = SQLConf.get.getConf(SQLConf.LEGACY_ALLOW_HASH_ON_MAPTYPE) + try { + SQLConf.get.setConf(SQLConf.LEGACY_ALLOW_HASH_ON_MAPTYPE, true) + f + } finally { + SQLConf.get.setConf(SQLConf.LEGACY_ALLOW_HASH_ON_MAPTYPE, originalAllowHash) + } + } + + def maybeAddAppendBatchesExec(plan: SparkPlan): SparkPlan = { + if (GlutenConfig.getConf.veloxCoalesceBatchesBeforeShuffle) { + VeloxAppendBatchesExec(plan, GlutenConfig.getConf.veloxMinBatchSizeForShuffle) + } else { + plan + } + } + + val child = shuffle.child + shuffle.outputPartitioning match { case HashPartitioning(exprs, _) => val hashExpr = new Murmur3Hash(exprs) - val projectList = Seq(Alias(hashExpr, "hash_partition_key")()) ++ newChild.output - val projectTransformer = ProjectExecTransformer(projectList, newChild) + val projectList = Seq(Alias(hashExpr, "hash_partition_key")()) ++ child.output + val projectTransformer = ProjectExecTransformer(projectList, child) val validationResult = projectTransformer.doValidate() if (validationResult.isValid) { - ColumnarShuffleExchangeExec( - shuffle, - projectTransformer, - projectTransformer.output.drop(1)) + val newChild = maybeAddAppendBatchesExec(projectTransformer) + ColumnarShuffleExchangeExec(shuffle, newChild, newChild.output.drop(1)) } else { - TransformHints.tagNotTransformable(shuffle, validationResult) - shuffle.withNewChildren(newChild :: Nil) + FallbackTags.add(shuffle, validationResult) + shuffle.withNewChildren(child :: Nil) } case RoundRobinPartitioning(num) if SQLConf.get.sortBeforeRepartition && num > 1 => - val hashExpr = new Murmur3Hash(newChild.output) - val projectList = Seq(Alias(hashExpr, "hash_partition_key")()) ++ newChild.output - val projectTransformer = ProjectExecTransformer(projectList, newChild) - val sortOrder = SortOrder(projectTransformer.output.head, Ascending) - val sortByHashCode = SortExecTransformer(Seq(sortOrder), global = false, projectTransformer) - val dropSortColumnTransformer = ProjectExecTransformer(projectList.drop(1), sortByHashCode) - val validationResult = dropSortColumnTransformer.doValidate() - if (validationResult.isValid) { - ColumnarShuffleExchangeExec( - shuffle, - dropSortColumnTransformer, - dropSortColumnTransformer.output) - } else { - TransformHints.tagNotTransformable(shuffle, validationResult) - shuffle.withNewChildren(newChild :: Nil) + // scalastyle:off line.size.limit + // Temporarily allow hash on map if it's disabled, otherwise HashExpression will fail to get + // resolved if its child contains map type. + // See https://github.com/apache/spark/blob/609bd4839e5d504917de74ed1cb9c23645fba51f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L279-L283 + // scalastyle:on line.size.limit + allowHashOnMap { + // Velox hash expression does not support null type and we also do not need to sort + // null type since the value always be null. 
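The allowHashOnMap helper introduced above follows the usual set-then-restore discipline for the thread-local SQLConf, so the legacy flag is never left flipped if planning fails. A standalone sketch of the same pattern (only the wrapper name is invented; the config entry is the one the patch uses):

    import org.apache.spark.sql.internal.SQLConf

    object HashOnMapSketch {
      def withHashOnMapAllowed[T](body: => T): T = {
        val conf = SQLConf.get
        val saved = conf.getConf(SQLConf.LEGACY_ALLOW_HASH_ON_MAPTYPE)
        conf.setConf(SQLConf.LEGACY_ALLOW_HASH_ON_MAPTYPE, true)
        try body
        finally conf.setConf(SQLConf.LEGACY_ALLOW_HASH_ON_MAPTYPE, saved) // restore even on failure
      }
    }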
+ val columnsForHash = child.output.filterNot(_.dataType == NullType) + if (columnsForHash.isEmpty) { + val newChild = maybeAddAppendBatchesExec(child) + ColumnarShuffleExchangeExec(shuffle, newChild, newChild.output) + } else { + val hashExpr = new Murmur3Hash(columnsForHash) + val projectList = Seq(Alias(hashExpr, "hash_partition_key")()) ++ child.output + val projectTransformer = ProjectExecTransformer(projectList, child) + val projectBeforeSortValidationResult = projectTransformer.doValidate() + // Make sure we support offload hash expression + val projectBeforeSort = if (projectBeforeSortValidationResult.isValid) { + projectTransformer + } else { + val project = ProjectExec(projectList, child) + FallbackTags.add(project, projectBeforeSortValidationResult) + project + } + val sortOrder = SortOrder(projectBeforeSort.output.head, Ascending) + val sortByHashCode = + SortExecTransformer(Seq(sortOrder), global = false, projectBeforeSort) + val dropSortColumnTransformer = + ProjectExecTransformer(projectList.drop(1), sortByHashCode) + val validationResult = dropSortColumnTransformer.doValidate() + if (validationResult.isValid) { + val newChild = maybeAddAppendBatchesExec(dropSortColumnTransformer) + ColumnarShuffleExchangeExec(shuffle, newChild, newChild.output) + } else { + FallbackTags.add(shuffle, validationResult) + shuffle.withNewChildren(child :: Nil) + } + } } case _ => + val newChild = maybeAddAppendBatchesExec(child) ColumnarShuffleExchangeExec(shuffle, newChild, null) } } @@ -392,6 +461,15 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { right, isNullAwareAntiJoin) + override def genSampleExecTransformer( + lowerBound: Double, + upperBound: Double, + withReplacement: Boolean, + seed: Long, + child: SparkPlan): SampleExecTransformer = { + SampleExecTransformer(lowerBound, upperBound, withReplacement, seed, child) + } + override def genSortMergeJoinExecTransformer( leftKeys: Seq[Expression], rightKeys: Seq[Expression], @@ -409,8 +487,7 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { left, right, isSkewJoin, - projectList - ) + projectList) } override def genCartesianProductExecTransformer( left: SparkPlan, @@ -419,8 +496,7 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { CartesianProductExecTransformer( ColumnarCartesianProductBridge(left), ColumnarCartesianProductBridge(right), - condition - ) + condition) } override def genBroadcastNestedLoopJoinExecTransformer( @@ -429,13 +505,7 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { buildSide: BuildSide, joinType: JoinType, condition: Option[Expression]): BroadcastNestedLoopJoinExecTransformer = - GlutenBroadcastNestedLoopJoinExecTransformer( - left, - right, - buildSide, - joinType, - condition - ) + VeloxBroadcastNestedLoopJoinExecTransformer(left, right, buildSide, joinType, condition) override def genHashExpressionTransformer( substraitExprName: String, @@ -716,20 +786,17 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { * * @return */ - override def genExtendedOptimizers(): List[SparkSession => Rule[LogicalPlan]] = List( - CollectRewriteRule.apply, - HLLRewriteRule.apply - ) + override def genExtendedOptimizers(): List[SparkSession => Rule[LogicalPlan]] = + List(CollectRewriteRule.apply, HLLRewriteRule.apply) /** * Generate extended columnar pre-rules, in the validation phase. 
* * @return */ - override def genExtendedColumnarValidationRules(): List[SparkSession => Rule[SparkPlan]] = List( - BloomFilterMightContainJointRewriteRule.apply, - ArrowScanReplaceRule.apply - ) + override def genExtendedColumnarValidationRules(): List[SparkSession => Rule[SparkPlan]] = { + List(BloomFilterMightContainJointRewriteRule.apply, ArrowScanReplaceRule.apply) + } /** * Generate extended columnar pre-rules. @@ -744,15 +811,6 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { buf.result } - /** - * Generate extended columnar post-rules. - * - * @return - */ - override def genExtendedColumnarPostRules(): List[SparkSession => Rule[SparkPlan]] = { - SparkShimLoader.getSparkShims.getExtendedColumnarPostRules() ::: List() - } - override def genInjectPostHocResolutionRules(): List[SparkSession => Rule[LogicalPlan]] = { List(ArrowConvertorRule) } @@ -776,7 +834,11 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { Sig[VeloxCollectList](ExpressionNames.COLLECT_LIST), Sig[VeloxCollectSet](ExpressionNames.COLLECT_SET), Sig[VeloxBloomFilterMightContain](ExpressionNames.MIGHT_CONTAIN), - Sig[VeloxBloomFilterAggregate](ExpressionNames.BLOOM_FILTER_AGG) + Sig[VeloxBloomFilterAggregate](ExpressionNames.BLOOM_FILTER_AGG), + Sig[TransformKeys](TRANSFORM_KEYS), + Sig[TransformValues](TRANSFORM_VALUES), + // For test purpose. + Sig[VeloxDummyExpression](VeloxDummyExpression.VELOX_DUMMY_EXPRESSION) ) } @@ -806,8 +868,7 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { requiredChildOutput: Seq[Attribute], outer: Boolean, generatorOutput: Seq[Attribute], - child: SparkPlan - ): GenerateExecTransformerBase = { + child: SparkPlan): GenerateExecTransformerBase = { GenerateExecTransformer(generator, requiredChildOutput, outer, generatorOutput, child) } diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxTransformerApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxTransformerApi.scala index aadfcd9b7d1e5..4cbde635e9bda 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxTransformerApi.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxTransformerApi.scala @@ -17,6 +17,7 @@ package org.apache.gluten.backendsapi.velox import org.apache.gluten.backendsapi.TransformerApi +import org.apache.gluten.exec.Runtimes import org.apache.gluten.expression.ConverterUtils import org.apache.gluten.substrait.expression.{ExpressionBuilder, ExpressionNode} import org.apache.gluten.utils.InputPartitionsUtil @@ -39,6 +40,7 @@ class VeloxTransformerApi extends TransformerApi with Logging { /** Generate Seq[InputPartition] for FileSourceScanExecTransformer. 
*/ def genInputPartitionSeq( relation: HadoopFsRelation, + requiredSchema: StructType, selectedPartitions: Array[PartitionDirectory], output: Seq[Attribute], bucketedScan: Boolean, @@ -48,6 +50,7 @@ class VeloxTransformerApi extends TransformerApi with Logging { filterExprs: Seq[Expression] = Seq.empty): Seq[InputPartition] = { InputPartitionsUtil( relation, + requiredSchema, selectedPartitions, output, bucketedScan, @@ -81,7 +84,8 @@ class VeloxTransformerApi extends TransformerApi with Logging { override def getNativePlanString(substraitPlan: Array[Byte], details: Boolean): String = { TaskResources.runUnsafe { - val jniWrapper = PlanEvaluatorJniWrapper.create() + val jniWrapper = PlanEvaluatorJniWrapper.create( + Runtimes.contextInstance("VeloxTransformerApi#getNativePlanString")) jniWrapper.nativePlanString(substraitPlan, details) } } diff --git a/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowCSVFileFormat.scala b/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowCSVFileFormat.scala index 0f6813d8fc6a1..a8e65b0539c7c 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowCSVFileFormat.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowCSVFileFormat.scala @@ -21,7 +21,8 @@ import org.apache.gluten.exception.SchemaMismatchException import org.apache.gluten.execution.RowToVeloxColumnarExec import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators import org.apache.gluten.memory.arrow.pool.ArrowNativeMemoryPool -import org.apache.gluten.utils.{ArrowUtil, Iterators} +import org.apache.gluten.utils.ArrowUtil +import org.apache.gluten.utils.iterator.Iterators import org.apache.gluten.vectorized.ArrowWritableColumnVector import org.apache.spark.TaskContext @@ -40,8 +41,10 @@ import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.SerializableConfiguration +import org.apache.arrow.c.ArrowSchema import org.apache.arrow.dataset.file.FileSystemDatasetFactory import org.apache.arrow.dataset.scanner.ScanOptions +import org.apache.arrow.dataset.scanner.csv.CsvFragmentScanOptions import org.apache.arrow.memory.BufferAllocator import org.apache.arrow.vector.VectorUnloader import org.apache.arrow.vector.types.pojo.Schema @@ -51,11 +54,17 @@ import org.apache.hadoop.fs.{FileStatus, Path} import java.net.URLDecoder import java.util.Optional -import scala.collection.JavaConverters.asScalaBufferConverter +import scala.collection.JavaConverters.{asJavaIterableConverter, asScalaBufferConverter} -class ArrowCSVFileFormat extends FileFormat with DataSourceRegister with Logging with Serializable { +class ArrowCSVFileFormat(parsedOptions: CSVOptions) + extends FileFormat + with DataSourceRegister + with Logging + with Serializable { private val fileFormat = org.apache.arrow.dataset.file.FileFormat.CSV + private lazy val pool = ArrowNativeMemoryPool.arrowPool("FileSystem Read") + var fallback = false override def isSplitable( sparkSession: SparkSession, @@ -68,9 +77,11 @@ class ArrowCSVFileFormat extends FileFormat with DataSourceRegister with Logging sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { + val arrowConfig = ArrowCSVOptionConverter.convert(parsedOptions) ArrowUtil.readSchema( files, fileFormat, + arrowConfig, ArrowBufferAllocators.contextInstance(), ArrowNativeMemoryPool.arrowPool("infer schema")) } @@ -89,51 +100,74 @@ class ArrowCSVFileFormat extends FileFormat with 
DataSourceRegister with Logging val broadcastedHadoopConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) val batchSize = sqlConf.columnBatchSize - val caseSensitive = sqlConf.caseSensitiveAnalysis val columnPruning = sqlConf.csvColumnPruning && !requiredSchema.exists(_.name == sparkSession.sessionState.conf.columnNameOfCorruptRecord) - val parsedOptions = new CSVOptions( - options, - columnPruning, - sparkSession.sessionState.conf.sessionLocalTimeZone, - sparkSession.sessionState.conf.columnNameOfCorruptRecord) val actualFilters = filters.filterNot(_.references.contains(parsedOptions.columnNameOfCorruptRecord)) (file: PartitionedFile) => { + val actualDataSchema = StructType( + dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) + val actualRequiredSchema = StructType( + requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) ArrowCSVFileFormat.checkHeader( file, - dataSchema, - requiredSchema, + actualDataSchema, + actualRequiredSchema, parsedOptions, actualFilters, broadcastedHadoopConf.value.value) - val factory = - ArrowUtil.makeArrowDiscovery( + + val arrowConfig = ArrowCSVOptionConverter.convert(parsedOptions) + val allocator = ArrowBufferAllocators.contextInstance() + // todo predicate validation / pushdown + val fileNames = ArrowUtil + .readArrowFileColumnNames( URLDecoder.decode(file.filePath.toString, "UTF-8"), fileFormat, + arrowConfig, ArrowBufferAllocators.contextInstance(), - ArrowNativeMemoryPool.arrowPool("FileSystemDatasetFactory") - ) - // todo predicate validation / pushdown - val fileFields = factory.inspect().getFields.asScala + pool) + val tokenIndexArr = + actualRequiredSchema + .map(f => java.lang.Integer.valueOf(actualDataSchema.indexOf(f))) + .toArray + val fileIndex = tokenIndexArr.filter(_ < fileNames.length) + val requestSchema = new StructType( + fileIndex + .map(index => StructField(fileNames(index), actualDataSchema(index).dataType))) + val missingIndex = tokenIndexArr.filter(_ >= fileNames.length) + val missingSchema = new StructType(missingIndex.map(actualDataSchema(_))) // TODO: support array/map/struct types in out-of-order schema reading. 
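The hunk above replaces case-sensitivity-aware field matching with a positional split: required columns whose index in the data schema is covered by the file's header are requested from Arrow, and the remainder become null-filled missing columns. A self-contained sketch of that split using only Spark's StructType (object and method names are illustrative):

    import org.apache.spark.sql.types.{StructField, StructType}

    object SchemaSplitSketch {
      def split(
          actualDataSchema: StructType,
          actualRequiredSchema: StructType,
          fileNames: Array[String]): (StructType, StructType) = {
        // Position of every required column inside the full data schema.
        val tokenIndexArr = actualRequiredSchema.map(f => actualDataSchema.indexOf(f)).toArray
        val (present, missing) = tokenIndexArr.partition(_ < fileNames.length)
        val requestSchema = new StructType(
          present.map(i => StructField(fileNames(i), actualDataSchema(i).dataType)))
        val missingSchema = new StructType(missing.map(actualDataSchema(_)))
        (requestSchema, missingSchema)
      }
    }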
+ val cSchema: ArrowSchema = ArrowSchema.allocateNew(allocator) + val cSchema2: ArrowSchema = ArrowSchema.allocateNew(allocator) try { - val actualReadFields = - ArrowUtil.getRequestedField(requiredSchema, fileFields, caseSensitive) + ArrowCSVOptionConverter.schema(requestSchema, cSchema, allocator, arrowConfig) + val factory = + ArrowUtil.makeArrowDiscovery( + URLDecoder.decode(file.filePath.toString, "UTF-8"), + fileFormat, + Optional.of(arrowConfig), + ArrowBufferAllocators.contextInstance(), + pool) + val fields = factory.inspect().getFields + val actualReadFields = new Schema( + fileIndex.map(index => fields.get(index)).toIterable.asJava) + ArrowCSVOptionConverter.schema(requestSchema, cSchema2, allocator, arrowConfig) ArrowCSVFileFormat .readArrow( ArrowBufferAllocators.contextInstance(), file, actualReadFields, - caseSensitive, - requiredSchema, + missingSchema, partitionSchema, factory, - batchSize) + batchSize, + arrowConfig) .asInstanceOf[Iterator[InternalRow]] } catch { case e: SchemaMismatchException => logWarning(e.getMessage) + fallback = true val iter = ArrowCSVFileFormat.fallbackReadVanilla( dataSchema, requiredSchema, @@ -148,8 +182,10 @@ class ArrowCSVFileFormat extends FileFormat with DataSourceRegister with Logging .rowToColumn(schema, batchSize, rows) .asInstanceOf[Iterator[InternalRow]] case d: Exception => throw d + } finally { + cSchema.close() + cSchema2.close() } - } } @@ -184,28 +220,23 @@ object ArrowCSVFileFormat { allocator: BufferAllocator, file: PartitionedFile, actualReadFields: Schema, - caseSensitive: Boolean, - requiredSchema: StructType, + missingSchema: StructType, partitionSchema: StructType, factory: FileSystemDatasetFactory, - batchSize: Int): Iterator[ColumnarBatch] = { - val compare = ArrowUtil.compareStringFunc(caseSensitive) + batchSize: Int, + arrowConfig: CsvFragmentScanOptions): Iterator[ColumnarBatch] = { val actualReadFieldNames = actualReadFields.getFields.asScala.map(_.getName).toArray - val actualReadSchema = new StructType( - actualReadFieldNames.map(f => requiredSchema.find(field => compare(f, field.name)).get)) val dataset = factory.finish(actualReadFields) - - val hasMissingColumns = actualReadFields.getFields.size() != requiredSchema.size - - val scanOptions = new ScanOptions(batchSize, Optional.of(actualReadFieldNames)) + val scanOptions = new ScanOptions.Builder(batchSize) + .columns(Optional.of(actualReadFieldNames)) + .fragmentScanOptions(arrowConfig) + .build() val scanner = dataset.newScan(scanOptions) val partitionVectors = ArrowUtil.loadPartitionColumns(batchSize, partitionSchema, file.partitionValues) - val nullVectors = if (hasMissingColumns) { - val missingSchema = - new StructType(requiredSchema.filterNot(actualReadSchema.contains).toArray) + val nullVectors = if (missingSchema.nonEmpty) { ArrowUtil.loadMissingColumns(batchSize, missingSchema) } else { Array.empty[ArrowWritableColumnVector] @@ -225,8 +256,7 @@ object ArrowCSVFileFormat { val batch = ArrowUtil.loadBatch( allocator, unloader.getRecordBatch, - actualReadSchema, - requiredSchema, + actualReadFields, partitionVectors, nullVectors) batch @@ -246,8 +276,8 @@ object ArrowCSVFileFormat { def checkHeader( file: PartitionedFile, - dataSchema: StructType, - requiredSchema: StructType, + actualDataSchema: StructType, + actualRequiredSchema: StructType, parsedOptions: CSVOptions, actualFilters: Seq[Filter], conf: Configuration): Unit = { @@ -255,10 +285,6 @@ object ArrowCSVFileFormat { if (!isStartOfFile) { return } - val actualDataSchema = StructType( - 
dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) - val actualRequiredSchema = StructType( - requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) val parser = new UnivocityParser(actualDataSchema, actualRequiredSchema, parsedOptions, actualFilters) val schema = if (parsedOptions.columnPruning) actualRequiredSchema else actualDataSchema diff --git a/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowCSVOptionConverter.scala b/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowCSVOptionConverter.scala new file mode 100644 index 0000000000000..7d6a54c2ac7a6 --- /dev/null +++ b/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowCSVOptionConverter.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.datasource + +import org.apache.gluten.utils.ArrowAbiUtil + +import org.apache.spark.sql.catalyst.csv.CSVOptions +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.utils.SparkSchemaUtil + +import com.google.common.collect.ImmutableMap +import org.apache.arrow.c.ArrowSchema +import org.apache.arrow.dataset.scanner.csv.{CsvConvertOptions, CsvFragmentScanOptions} +import org.apache.arrow.memory.BufferAllocator + +import java.util + +object ArrowCSVOptionConverter { + def convert(option: CSVOptions): CsvFragmentScanOptions = { + val parseMap = new util.HashMap[String, String]() + val default = new CSVOptions( + CaseInsensitiveMap(Map()), + option.columnPruning, + SparkSchemaUtil.getLocalTimezoneID) + parseMap.put("strings_can_be_null", "true") + if (option.delimiter != default.delimiter) { + parseMap.put("delimiter", option.delimiter) + } + if (option.escapeQuotes != default.escapeQuotes) { + parseMap.put("quoting", (!option.escapeQuotes).toString) + } + + val convertOptions = new CsvConvertOptions(ImmutableMap.of()) + new CsvFragmentScanOptions(convertOptions, ImmutableMap.of(), parseMap) + } + + def schema( + requiredSchema: StructType, + cSchema: ArrowSchema, + allocator: BufferAllocator, + option: CsvFragmentScanOptions): Unit = { + val schema = SparkSchemaUtil.toArrowSchema(requiredSchema) + ArrowAbiUtil.exportSchema(allocator, schema, cSchema) + option.getConvertOptions.setArrowSchema(cSchema) + } + +} diff --git a/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowConvertorRule.scala b/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowConvertorRule.scala index dab1ffd3b9e3f..2778710155bf9 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowConvertorRule.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowConvertorRule.scala @@ -19,6 +19,7 @@ package 
org.apache.gluten.datasource import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.datasource.v2.ArrowCSVTable import org.apache.gluten.sql.shims.SparkShimLoader +import org.apache.gluten.utils.LogicalPlanSelector import org.apache.spark.annotation.Experimental import org.apache.spark.sql.SparkSession @@ -39,7 +40,7 @@ import scala.collection.convert.ImplicitConversions.`map AsScala` @Experimental case class ArrowConvertorRule(session: SparkSession) extends Rule[LogicalPlan] { - override def apply(plan: LogicalPlan): LogicalPlan = { + override def apply(plan: LogicalPlan): LogicalPlan = LogicalPlanSelector.maybe(session, plan) { if (!BackendsApiManager.getSettings.enableNativeArrowReadFiles()) { return plan } @@ -49,7 +50,11 @@ case class ArrowConvertorRule(session: SparkSession) extends Rule[LogicalPlan] { _, _, _) if validate(session, dataSchema, options) => - l.copy(relation = r.copy(fileFormat = new ArrowCSVFileFormat())(session)) + val csvOptions = new CSVOptions( + options, + columnPruning = session.sessionState.conf.csvColumnPruning, + session.sessionState.conf.sessionLocalTimeZone) + l.copy(relation = r.copy(fileFormat = new ArrowCSVFileFormat(csvOptions))(session)) case d @ DataSourceV2Relation( t @ CSVTable( name, @@ -88,7 +93,8 @@ case class ArrowConvertorRule(session: SparkSession) extends Rule[LogicalPlan] { } private def checkCsvOptions(csvOptions: CSVOptions, timeZone: String): Boolean = { - csvOptions.headerFlag && !csvOptions.multiLine && csvOptions.delimiter == "," && + csvOptions.headerFlag && !csvOptions.multiLine && + csvOptions.delimiter.length == 1 && csvOptions.quote == '\"' && csvOptions.escape == '\\' && csvOptions.lineSeparator.isEmpty && diff --git a/backends-velox/src/main/scala/org/apache/gluten/datasource/v2/ArrowCSVPartitionReaderFactory.scala b/backends-velox/src/main/scala/org/apache/gluten/datasource/v2/ArrowCSVPartitionReaderFactory.scala index ddc7f797fb938..4af5022a6252a 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/datasource/v2/ArrowCSVPartitionReaderFactory.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/datasource/v2/ArrowCSVPartitionReaderFactory.scala @@ -16,7 +16,7 @@ */ package org.apache.gluten.datasource.v2 -import org.apache.gluten.datasource.ArrowCSVFileFormat +import org.apache.gluten.datasource.{ArrowCSVFileFormat, ArrowCSVOptionConverter} import org.apache.gluten.exception.SchemaMismatchException import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators import org.apache.gluten.memory.arrow.pool.ArrowNativeMemoryPool @@ -31,15 +31,17 @@ import org.apache.spark.sql.execution.datasources.PartitionedFile import org.apache.spark.sql.execution.datasources.v2.FilePartitionReaderFactory import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.{SerializableConfiguration, TaskResources} -import org.apache.arrow.dataset.file.FileFormat +import org.apache.arrow.c.ArrowSchema +import org.apache.arrow.vector.types.pojo.Schema import java.net.URLDecoder +import java.util.Optional -import scala.collection.JavaConverters.asScalaBufferConverter +import scala.collection.JavaConverters.asJavaIterableConverter case class ArrowCSVPartitionReaderFactory( sqlConf: SQLConf, @@ -53,8 +55,9 @@ case class ArrowCSVPartitionReaderFactory( with Logging { private val batchSize = 
sqlConf.parquetVectorizedReaderBatchSize - private val caseSensitive: Boolean = sqlConf.caseSensitiveAnalysis private val csvColumnPruning: Boolean = sqlConf.csvColumnPruning + private val fileFormat = org.apache.arrow.dataset.file.FileFormat.CSV + var fallback = false override def supportColumnarReads(partition: InputPartition): Boolean = true @@ -67,12 +70,12 @@ case class ArrowCSVPartitionReaderFactory( partitionedFile: PartitionedFile): PartitionReader[ColumnarBatch] = { val actualDataSchema = StructType( dataSchema.filterNot(_.name == options.columnNameOfCorruptRecord)) - val actualReadDataSchema = StructType( + val actualRequiredSchema = StructType( readDataSchema.filterNot(_.name == options.columnNameOfCorruptRecord)) ArrowCSVFileFormat.checkHeader( partitionedFile, actualDataSchema, - actualReadDataSchema, + actualRequiredSchema, options, filters, broadcastedConf.value.value) @@ -87,29 +90,54 @@ case class ArrowCSVPartitionReaderFactory( ArrowBufferAllocators.contextInstance(), ArrowNativeMemoryPool.arrowPool("FileSystemFactory")) } - val factory = ArrowUtil.makeArrowDiscovery( - URLDecoder.decode(partitionedFile.filePath.toString(), "UTF-8"), - FileFormat.CSV, - allocator, - pool) - val parquetFileFields = factory.inspect().getFields.asScala + val arrowConfig = ArrowCSVOptionConverter.convert(options) + val fileNames = ArrowUtil + .readArrowFileColumnNames( + URLDecoder.decode(partitionedFile.filePath.toString, "UTF-8"), + fileFormat, + arrowConfig, + ArrowBufferAllocators.contextInstance(), + pool) + val tokenIndexArr = + actualRequiredSchema.map(f => java.lang.Integer.valueOf(actualDataSchema.indexOf(f))).toArray + val fileIndex = tokenIndexArr.filter(_ < fileNames.length) + val requestSchema = new StructType( + fileIndex + .map(index => StructField(fileNames(index), actualDataSchema(index).dataType))) + val missingIndex = tokenIndexArr.filter(_ >= fileNames.length) + val missingSchema = new StructType(missingIndex.map(actualDataSchema(_))) + // TODO: support array/map/struct types in out-of-order schema reading. + val cSchema: ArrowSchema = ArrowSchema.allocateNew(allocator) + val cSchema2: ArrowSchema = ArrowSchema.allocateNew(allocator) // TODO: support array/map/struct types in out-of-order schema reading. 
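As in the file-format path earlier, this reader allocates two C-ABI ArrowSchema structs and pairs each with a close() in a finally block, even when the read falls back to the vanilla parser. A minimal sketch of that allocate/use/close discipline (the helper and its RootAllocator are illustrative; the real code uses the task-scoped Arrow allocator):

    import org.apache.arrow.c.ArrowSchema
    import org.apache.arrow.memory.{BufferAllocator, RootAllocator}

    object CSchemaSketch {
      def withCSchema[T](allocator: BufferAllocator)(body: ArrowSchema => T): T = {
        val cSchema = ArrowSchema.allocateNew(allocator)
        try body(cSchema)
        finally cSchema.close() // always release the exported schema
      }

      def main(args: Array[String]): Unit = {
        val allocator = new RootAllocator()
        try withCSchema(allocator)(s => println(s.memoryAddress()))
        finally allocator.close()
      }
    }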
val iter = try { - val actualReadFields = - ArrowUtil.getRequestedField(readDataSchema, parquetFileFields, caseSensitive) - ArrowCSVFileFormat.readArrow( - allocator, - partitionedFile, - actualReadFields, - caseSensitive, - readDataSchema, - readPartitionSchema, - factory, - batchSize) + ArrowCSVOptionConverter.schema(requestSchema, cSchema, allocator, arrowConfig) + val factory = + ArrowUtil.makeArrowDiscovery( + URLDecoder.decode(partitionedFile.filePath.toString, "UTF-8"), + fileFormat, + Optional.of(arrowConfig), + ArrowBufferAllocators.contextInstance(), + pool) + val fields = factory.inspect().getFields + val actualReadFields = new Schema( + fileIndex.map(index => fields.get(index)).toIterable.asJava) + ArrowCSVOptionConverter.schema(requestSchema, cSchema2, allocator, arrowConfig) + ArrowCSVFileFormat + .readArrow( + ArrowBufferAllocators.contextInstance(), + partitionedFile, + actualReadFields, + missingSchema, + readPartitionSchema, + factory, + batchSize, + arrowConfig) } catch { case e: SchemaMismatchException => logWarning(e.getMessage) + fallback = true val iter = ArrowCSVFileFormat.fallbackReadVanilla( dataSchema, readDataSchema, @@ -125,6 +153,9 @@ case class ArrowCSVPartitionReaderFactory( partitionedFile) ArrowCSVFileFormat.rowToColumn(schema, batchSize, rows) case d: Exception => throw d + } finally { + cSchema.close() + cSchema2.close() } new PartitionReader[ColumnarBatch] { diff --git a/backends-velox/src/main/scala/org/apache/gluten/datasource/v2/ArrowCSVTable.scala b/backends-velox/src/main/scala/org/apache/gluten/datasource/v2/ArrowCSVTable.scala index aa7f737f9cfcd..02485975e7055 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/datasource/v2/ArrowCSVTable.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/datasource/v2/ArrowCSVTable.scala @@ -16,11 +16,13 @@ */ package org.apache.gluten.datasource.v2 +import org.apache.gluten.datasource.ArrowCSVOptionConverter import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators import org.apache.gluten.memory.arrow.pool.ArrowNativeMemoryPool import org.apache.gluten.utils.ArrowUtil import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.csv.CSVOptions import org.apache.spark.sql.connector.read.ScanBuilder import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} import org.apache.spark.sql.execution.datasources.FileFormat @@ -31,6 +33,8 @@ import org.apache.spark.util.TaskResources import org.apache.hadoop.fs.FileStatus +import scala.collection.JavaConverters.mapAsScalaMapConverter + case class ArrowCSVTable( name: String, sparkSession: SparkSession, @@ -48,9 +52,17 @@ case class ArrowCSVTable( } else { (ArrowBufferAllocators.contextInstance(), ArrowNativeMemoryPool.arrowPool("inferSchema")) } + val parsedOptions: CSVOptions = new CSVOptions( + options.asScala.toMap, + columnPruning = sparkSession.sessionState.conf.csvColumnPruning, + sparkSession.sessionState.conf.sessionLocalTimeZone, + sparkSession.sessionState.conf.columnNameOfCorruptRecord + ) + val arrowConfig = ArrowCSVOptionConverter.convert(parsedOptions) ArrowUtil.readSchema( files.head, org.apache.arrow.dataset.file.FileFormat.CSV, + arrowConfig, allocator, pool ) diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/GenerateExecTransformer.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/GenerateExecTransformer.scala index c9b0abd6fabf9..8ceea8c14f6ad 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/execution/GenerateExecTransformer.scala +++ 
b/backends-velox/src/main/scala/org/apache/gluten/execution/GenerateExecTransformer.scala @@ -55,7 +55,7 @@ case class GenerateExecTransformer( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "numOutputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), "numOutputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), - "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "totaltime of generate"), + "wallNanos" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time of generate"), "cpuCount" -> SQLMetrics.createMetric(sparkContext, "cpu wall time count"), "peakMemoryBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory bytes"), "numMemoryAllocations" -> SQLMetrics.createMetric( diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/HashAggregateExecTransformer.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/HashAggregateExecTransformer.scala index 2f447572406bf..4f33ae7c718ca 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/execution/HashAggregateExecTransformer.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/execution/HashAggregateExecTransformer.scala @@ -20,7 +20,6 @@ import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.exception.GlutenNotSupportException import org.apache.gluten.expression._ import org.apache.gluten.expression.ConverterUtils.FunctionConfig -import org.apache.gluten.expression.aggregate.HLLAdapter import org.apache.gluten.substrait.`type`.{TypeBuilder, TypeNode} import org.apache.gluten.substrait.{AggregationParams, SubstraitContext} import org.apache.gluten.substrait.expression.{AggregateFunctionNode, ExpressionBuilder, ExpressionNode, ScalarFunctionNode} @@ -65,8 +64,8 @@ abstract class HashAggregateExecTransformer( super.output } - override def doTransform(context: SubstraitContext): TransformContext = { - val childCtx = child.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val childCtx = child.asInstanceOf[TransformSupport].transform(context) val aggParams = new AggregationParams val operatorId = context.nextOperatorId(this.nodeName) @@ -74,20 +73,6 @@ abstract class HashAggregateExecTransformer( TransformContext(childCtx.outputAttributes, output, relNode) } - override protected def checkAggFuncModeSupport( - aggFunc: AggregateFunction, - mode: AggregateMode): Boolean = { - aggFunc match { - case _: HLLAdapter => - mode match { - case Partial | Final => true - case _ => false - } - case _ => - super.checkAggFuncModeSupport(aggFunc, mode) - } - } - // Return whether the outputs partial aggregation should be combined for Velox computing. // When the partial outputs are multiple-column, row construct is needed. private def rowConstructNeeded(aggregateExpressions: Seq[AggregateExpression]): Boolean = { @@ -241,21 +226,21 @@ abstract class HashAggregateExecTransformer( } aggregateFunction match { - case hllAdapter: HLLAdapter => + case _ if aggregateFunction.aggBufferAttributes.size > 1 => + generateMergeCompanionNode() + case _ => aggregateMode match { - case Partial => - // For Partial mode output type is binary. 
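The rewritten aggregate-function match above (continuing just below) folds the HLL-specific branch into the generic one: for Partial and PartialMerge the emitted output type comes from the function's first inputAggBufferAttribute, while Final emits the function's result type. A simplified view of that rule (the Catalyst types are real, the helper is illustrative and ignores nullability):

    import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, AggregateMode, Final, Partial, PartialMerge}
    import org.apache.spark.sql.types.DataType

    object AggOutputTypeSketch {
      def outputType(func: AggregateFunction, mode: AggregateMode): DataType = mode match {
        case Partial | PartialMerge => func.inputAggBufferAttributes.head.dataType // serialized buffer
        case Final => func.dataType // final result type
        case other => throw new UnsupportedOperationException(s"$other is not supported.")
      }
    }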
+ case Partial | PartialMerge => val partialNode = ExpressionBuilder.makeAggregateFunction( VeloxAggregateFunctionsBuilder.create(args, aggregateFunction, aggregateMode), childrenNodeList, modeKeyWord, ConverterUtils.getTypeNode( - hllAdapter.inputAggBufferAttributes.head.dataType, - hllAdapter.inputAggBufferAttributes.head.nullable) + aggregateFunction.inputAggBufferAttributes.head.dataType, + aggregateFunction.inputAggBufferAttributes.head.nullable) ) aggregateNodeList.add(partialNode) case Final => - // For Final mode output type is long. val aggFunctionNode = ExpressionBuilder.makeAggregateFunction( VeloxAggregateFunctionsBuilder.create(args, aggregateFunction, aggregateMode), childrenNodeList, @@ -266,16 +251,6 @@ abstract class HashAggregateExecTransformer( case other => throw new GlutenNotSupportException(s"$other is not supported.") } - case _ if aggregateFunction.aggBufferAttributes.size > 1 => - generateMergeCompanionNode() - case _ => - val aggFunctionNode = ExpressionBuilder.makeAggregateFunction( - VeloxAggregateFunctionsBuilder.create(args, aggregateFunction, aggregateMode), - childrenNodeList, - modeKeyWord, - ConverterUtils.getTypeNode(aggregateFunction.dataType, aggregateFunction.nullable) - ) - aggregateNodeList.add(aggFunctionNode) } } diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/RowToVeloxColumnarExec.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/RowToVeloxColumnarExec.scala index be1bc64e21b83..7bcf56f7edb05 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/execution/RowToVeloxColumnarExec.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/execution/RowToVeloxColumnarExec.scala @@ -21,8 +21,8 @@ import org.apache.gluten.columnarbatch.ColumnarBatches import org.apache.gluten.exception.GlutenException import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators -import org.apache.gluten.memory.nmm.NativeMemoryManagers -import org.apache.gluten.utils.{ArrowAbiUtil, Iterators} +import org.apache.gluten.utils.ArrowAbiUtil +import org.apache.gluten.utils.iterator.Iterators import org.apache.gluten.vectorized._ import org.apache.spark.broadcast.Broadcast @@ -70,8 +70,7 @@ case class RowToVeloxColumnarExec(child: SparkPlan) extends RowToColumnarExecBas numInputRows, numOutputBatches, convertTime, - numRows - ) + numRows) } } @@ -96,9 +95,7 @@ case class RowToVeloxColumnarExec(child: SparkPlan) extends RowToColumnarExecBas numInputRows, numOutputBatches, convertTime, - numRows - ) - ) + numRows)) } // For spark 3.2. 
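The RowToVeloxColumnarExec hunks in this file (continued below) move the convertTime bookkeeping inside the batching loop, so time spent waiting on the upstream row iterator is no longer counted as conversion time. A tiny sketch of that bracket-only-the-work pattern (the helper name is invented; SQLMetric is Spark's):

    import org.apache.spark.sql.execution.metric.SQLMetric

    object ConvertTimeSketch {
      // Accumulate only the duration of the work itself, not iterator waits around it.
      def timed[T](convertTime: SQLMetric)(work: => T): T = {
        val start = System.currentTimeMillis()
        try work
        finally convertTime += System.currentTimeMillis() - start
      }
    }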
@@ -120,16 +117,16 @@ object RowToVeloxColumnarExec { val arrowSchema = SparkArrowUtil.toArrowSchema(schema, SQLConf.get.sessionLocalTimeZone) - val jniWrapper = NativeRowToColumnarJniWrapper.create() + val runtime = Runtimes.contextInstance("RowToColumnar") + val jniWrapper = NativeRowToColumnarJniWrapper.create(runtime) val arrowAllocator = ArrowBufferAllocators.contextInstance() - val memoryManager = NativeMemoryManagers.contextInstance("RowToColumnar") val cSchema = ArrowSchema.allocateNew(arrowAllocator) val factory = UnsafeProjection val converter = factory.create(schema) val r2cHandle = try { ArrowAbiUtil.exportSchema(arrowAllocator, arrowSchema, cSchema) - jniWrapper.init(cSchema.memoryAddress(), memoryManager.getNativeInstanceHandle) + jniWrapper.init(cSchema.memoryAddress()) } finally { cSchema.close() } @@ -145,10 +142,20 @@ object RowToVeloxColumnarExec { } } - def nativeConvert(row: UnsafeRow): ColumnarBatch = { + def convertToUnsafeRow(row: InternalRow): UnsafeRow = { + row match { + case unsafeRow: UnsafeRow => unsafeRow + case _ => + converter.apply(row) + } + } + + override def next(): ColumnarBatch = { + val firstRow = it.next() + val start = System.currentTimeMillis() + val row = convertToUnsafeRow(firstRow) var arrowBuf: ArrowBuf = null TaskResources.addRecycler("RowToColumnar_arrowBuf", 100) { - // Remind, remove isOpen here if (arrowBuf != null && arrowBuf.refCnt() != 0) { arrowBuf.close() } @@ -175,12 +182,14 @@ object RowToVeloxColumnarExec { rowLength += sizeInBytes.toLong rowCount += 1 + convertTime += System.currentTimeMillis() - start while (rowCount < columnBatchSize && !finished) { val iterHasNext = it.hasNext if (!iterHasNext) { finished = true } else { val row = it.next() + val start2 = System.currentTimeMillis() val unsafeRow = convertToUnsafeRow(row) val sizeInBytes = unsafeRow.getSizeInBytes if ((offset + sizeInBytes) > arrowBuf.capacity()) { @@ -198,36 +207,23 @@ object RowToVeloxColumnarExec { offset += sizeInBytes rowLength += sizeInBytes.toLong rowCount += 1 + convertTime += System.currentTimeMillis() - start2 } } numInputRows += rowCount + numOutputBatches += 1 + val startNative = System.currentTimeMillis() try { val handle = jniWrapper .nativeConvertRowToColumnar(r2cHandle, rowLength.toArray, arrowBuf.memoryAddress()) - ColumnarBatches.create(Runtimes.contextInstance(), handle) + val cb = ColumnarBatches.create(handle) + convertTime += System.currentTimeMillis() - startNative + cb } finally { arrowBuf.close() arrowBuf = null } } - - def convertToUnsafeRow(row: InternalRow): UnsafeRow = { - row match { - case unsafeRow: UnsafeRow => unsafeRow - case _ => - converter.apply(row) - } - } - - override def next(): ColumnarBatch = { - val firstRow = it.next() - val start = System.currentTimeMillis() - val unsafeRow = convertToUnsafeRow(firstRow) - val cb = nativeConvert(unsafeRow) - numOutputBatches += 1 - convertTime += System.currentTimeMillis() - start - cb - } } Iterators .wrap(res) diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/ShuffledHashJoinExecTransformer.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/ShuffledHashJoinExecTransformer.scala index c9c60772fe738..d78e6c5b3e538 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/execution/ShuffledHashJoinExecTransformer.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/execution/ShuffledHashJoinExecTransformer.scala @@ -16,12 +16,13 @@ */ package org.apache.gluten.execution +import org.apache.spark.rdd.RDD import 
org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide} +import org.apache.spark.sql.catalyst.optimizer.BuildSide import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.execution.{FilterExec, SparkPlan} -import org.apache.spark.sql.execution.aggregate.BaseAggregateExec +import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.joins.BuildSideRelation +import org.apache.spark.sql.vectorized.ColumnarBatch import io.substrait.proto.JoinRel @@ -44,80 +45,6 @@ case class ShuffledHashJoinExecTransformer( right, isSkewJoin) { - // Used to specify the preferred build side in backend's real execution. - object PreferredBuildSide extends Serializable { - val LEFT = "left table" - val RIGHT = "right table" - val NON = "none" - } - - /** - * Returns whether the plan matches the condition to be preferred as build side. Currently, filter - * and aggregation are preferred. - * @param plan - * the left or right plan of join - * @return - * whether the plan matches the condition - */ - private def matchCondition(plan: SparkPlan): Boolean = - plan.isInstanceOf[FilterExecTransformerBase] || plan.isInstanceOf[FilterExec] || - plan.isInstanceOf[BaseAggregateExec] - - /** - * Returns whether a plan is preferred as the build side. If this plan or its children match the - * condition, it will be preferred. - * @param plan - * the left or right plan of join - * @return - * whether the plan is preferred as the build side - */ - private def isPreferred(plan: SparkPlan): Boolean = - matchCondition(plan) || plan.children.exists(child => matchCondition(child)) - - // Returns the preferred build side with the consideration of preferring condition. - private lazy val preferredBuildSide: String = - if ((isPreferred(left) && isPreferred(right)) || (!isPreferred(left) && !isPreferred(right))) { - PreferredBuildSide.NON - } else if (isPreferred(left)) { - PreferredBuildSide.LEFT - } else { - PreferredBuildSide.RIGHT - } - - /** - * Returns whether the build and stream table should be exchanged with consideration of build - * type, planned build side and the preferred build side. - */ - override lazy val needSwitchChildren: Boolean = hashJoinType match { - case LeftOuter | LeftSemi | ExistenceJoin(_) => - joinBuildSide match { - case BuildLeft => - // Exchange build and stream side when left side or none is preferred as the build side, - // and RightOuter or RightSemi wil be used. - !(preferredBuildSide == PreferredBuildSide.RIGHT) - case _ => - // Do not exchange build and stream side when right side or none is preferred - // as the build side, and LeftOuter or LeftSemi wil be used. - preferredBuildSide == PreferredBuildSide.LEFT - } - case RightOuter => - joinBuildSide match { - case BuildRight => - // Do not exchange build and stream side when right side or none is preferred - // as the build side, and RightOuter will be used. - preferredBuildSide == PreferredBuildSide.LEFT - case _ => - // Exchange build and stream side when left side or none is preferred as the build side, - // and LeftOuter will be used. 
- !(preferredBuildSide == PreferredBuildSide.RIGHT) - } - case _ => - joinBuildSide match { - case BuildLeft => true - case BuildRight => false - } - } - override protected lazy val substraitJoinType: JoinRel.JoinType = joinType match { case _: InnerLike => JoinRel.JoinType.JOIN_TYPE_INNER @@ -196,8 +123,11 @@ case class BroadcastHashJoinExecTransformer( newRight: SparkPlan): BroadcastHashJoinExecTransformer = copy(left = newLeft, right = newRight) - override protected def createBroadcastBuildSideRDD(): BroadcastBuildSideRDD = { + override def columnarInputRDDs: Seq[RDD[ColumnarBatch]] = { + val streamedRDD = getColumnarInputRDDs(streamedPlan) val broadcast = buildPlan.executeBroadcast[BuildSideRelation]() - VeloxBroadcastBuildSideRDD(sparkContext, broadcast) + val broadcastRDD = VeloxBroadcastBuildSideRDD(sparkContext, broadcast) + // FIXME: Do we have to make build side a RDD? + streamedRDD :+ broadcastRDD } } diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/TopNTransformer.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/TopNTransformer.scala index 9df821fe10c54..01c89bee217b8 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/execution/TopNTransformer.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/execution/TopNTransformer.scala @@ -19,7 +19,7 @@ package org.apache.gluten.execution import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.expression.{ConverterUtils, ExpressionConverter} import org.apache.gluten.extension.ValidationResult -import org.apache.gluten.metrics.{MetricsUpdater, NoopMetricsUpdater} +import org.apache.gluten.metrics.MetricsUpdater import org.apache.gluten.substrait.`type`.TypeBuilder import org.apache.gluten.substrait.SubstraitContext import org.apache.gluten.substrait.extensions.ExtensionBuilder @@ -67,8 +67,8 @@ case class TopNTransformer( doNativeValidation(context, relNode) } - override def doTransform(context: SubstraitContext): TransformContext = { - val childCtx = child.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val childCtx = child.asInstanceOf[TransformSupport].transform(context) val operatorId = context.nextOperatorId(this.nodeName) val relNode = getRelNode( @@ -114,5 +114,5 @@ case class TopNTransformer( } } - override def metricsUpdater(): MetricsUpdater = NoopMetricsUpdater // TODO + override def metricsUpdater(): MetricsUpdater = MetricsUpdater.Todo // TODO } diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxAppendBatchesExec.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxAppendBatchesExec.scala new file mode 100644 index 0000000000000..4b4db703de7a4 --- /dev/null +++ b/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxAppendBatchesExec.scala @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.execution + +import org.apache.gluten.extension.GlutenPlan +import org.apache.gluten.utils.VeloxBatchAppender +import org.apache.gluten.utils.iterator.Iterators + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} +import org.apache.spark.sql.catalyst.plans.physical.Partitioning +import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} +import org.apache.spark.sql.vectorized.ColumnarBatch + +import java.util.concurrent.atomic.AtomicLong + +import scala.collection.JavaConverters._ + +/** + * An operator to coalesce input batches by appending the later batches to the one that comes + * earlier. + */ +case class VeloxAppendBatchesExec(override val child: SparkPlan, minOutputBatchSize: Int) + extends GlutenPlan + with UnaryExecNode { + + override lazy val metrics: Map[String, SQLMetric] = Map( + "numInputRows" -> SQLMetrics.createMetric(sparkContext, "number of input rows"), + "numInputBatches" -> SQLMetrics.createMetric(sparkContext, "number of input batches"), + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), + "numOutputBatches" -> SQLMetrics.createMetric(sparkContext, "number of output batches"), + "appendTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to append batches") + ) + + override def supportsColumnar: Boolean = true + override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() + + override protected def doExecuteColumnar(): RDD[ColumnarBatch] = { + val numInputRows = longMetric("numInputRows") + val numInputBatches = longMetric("numInputBatches") + val numOutputRows = longMetric("numOutputRows") + val numOutputBatches = longMetric("numOutputBatches") + val appendTime = longMetric("appendTime") + + child.executeColumnar().mapPartitions { + in => + // Append millis = Out millis - In millis. 
+ val appendMillis = new AtomicLong(0L) + + val appender = VeloxBatchAppender.create( + minOutputBatchSize, + Iterators + .wrap(in) + .collectReadMillis(inMillis => appendMillis.getAndAdd(-inMillis)) + .create() + .map { + inBatch => + numInputRows += inBatch.numRows() + numInputBatches += 1 + inBatch + } + .asJava + ) + + val out = Iterators + .wrap(appender.asScala) + .collectReadMillis(outMillis => appendMillis.getAndAdd(outMillis)) + .recyclePayload(_.close()) + .recycleIterator { + appender.close() + appendTime += appendMillis.get() + } + .create() + .map { + outBatch => + numOutputRows += outBatch.numRows() + numOutputBatches += 1 + outBatch + } + + out + } + } + + override def output: Seq[Attribute] = child.output + override def outputPartitioning: Partitioning = child.outputPartitioning + override def outputOrdering: Seq[SortOrder] = child.outputOrdering + override protected def withNewChildInternal(newChild: SparkPlan): SparkPlan = + copy(child = newChild) +} diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxBroadcastBuildSideRDD.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxBroadcastBuildSideRDD.scala index 17d0522d0732a..fe3c0b7e3938f 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxBroadcastBuildSideRDD.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxBroadcastBuildSideRDD.scala @@ -16,7 +16,7 @@ */ package org.apache.gluten.execution -import org.apache.gluten.utils.Iterators +import org.apache.gluten.utils.iterator.Iterators import org.apache.spark.{broadcast, SparkContext} import org.apache.spark.sql.execution.joins.BuildSideRelation diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/GlutenBroadcastNestedLoopJoinExecTransformer.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxBroadcastNestedLoopJoinExecTransformer.scala similarity index 76% rename from backends-velox/src/main/scala/org/apache/gluten/execution/GlutenBroadcastNestedLoopJoinExecTransformer.scala rename to backends-velox/src/main/scala/org/apache/gluten/execution/VeloxBroadcastNestedLoopJoinExecTransformer.scala index 3cde6b27b1fa1..8517422698d93 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/execution/GlutenBroadcastNestedLoopJoinExecTransformer.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxBroadcastNestedLoopJoinExecTransformer.scala @@ -16,13 +16,15 @@ */ package org.apache.gluten.execution +import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.optimizer.BuildSide import org.apache.spark.sql.catalyst.plans.JoinType import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.joins.BuildSideRelation +import org.apache.spark.sql.vectorized.ColumnarBatch -case class GlutenBroadcastNestedLoopJoinExecTransformer( +case class VeloxBroadcastNestedLoopJoinExecTransformer( left: SparkPlan, right: SparkPlan, buildSide: BuildSide, @@ -36,14 +38,17 @@ case class GlutenBroadcastNestedLoopJoinExecTransformer( condition ) { - override protected def createBroadcastBuildSideRDD(): BroadcastBuildSideRDD = { + override def columnarInputRDDs: Seq[RDD[ColumnarBatch]] = { + val streamedRDD = getColumnarInputRDDs(streamedPlan) val broadcast = buildPlan.executeBroadcast[BuildSideRelation]() - VeloxBroadcastBuildSideRDD(sparkContext, broadcast) + val broadcastRDD = VeloxBroadcastBuildSideRDD(sparkContext, broadcast) + // FIXME: Do we have to make 
build side a RDD? + streamedRDD :+ broadcastRDD } override protected def withNewChildrenInternal( newLeft: SparkPlan, - newRight: SparkPlan): GlutenBroadcastNestedLoopJoinExecTransformer = + newRight: SparkPlan): VeloxBroadcastNestedLoopJoinExecTransformer = copy(left = newLeft, right = newRight) } diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxColumnarToRowExec.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxColumnarToRowExec.scala index 77bf497272831..1a54255208ea4 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxColumnarToRowExec.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxColumnarToRowExec.scala @@ -18,9 +18,9 @@ package org.apache.gluten.execution import org.apache.gluten.columnarbatch.ColumnarBatches import org.apache.gluten.exception.GlutenNotSupportException +import org.apache.gluten.exec.Runtimes import org.apache.gluten.extension.ValidationResult -import org.apache.gluten.memory.nmm.NativeMemoryManagers -import org.apache.gluten.utils.Iterators +import org.apache.gluten.utils.iterator.Iterators import org.apache.gluten.vectorized.NativeColumnarToRowJniWrapper import org.apache.spark.broadcast.Broadcast @@ -75,13 +75,8 @@ case class VeloxColumnarToRowExec(child: SparkPlan) extends ColumnarToRowExecBas val convertTime = longMetric("convertTime") child.executeColumnar().mapPartitions { it => - VeloxColumnarToRowExec.toRowIterator( - it, - output, - numOutputRows, - numInputBatches, - convertTime - ) + VeloxColumnarToRowExec + .toRowIterator(it, output, numOutputRows, numInputBatches, convertTime) } } @@ -96,13 +91,7 @@ case class VeloxColumnarToRowExec(child: SparkPlan) extends ColumnarToRowExecBas sparkContext, mode, relation, - VeloxColumnarToRowExec.toRowIterator( - _, - output, - numOutputRows, - numInputBatches, - convertTime - )) + VeloxColumnarToRowExec.toRowIterator(_, output, numOutputRows, numInputBatches, convertTime)) } protected def withNewChildInternal(newChild: SparkPlan): VeloxColumnarToRowExec = @@ -120,10 +109,10 @@ object VeloxColumnarToRowExec { return Iterator.empty } + val runtime = Runtimes.contextInstance("ColumnarToRow") // TODO:: pass the jni jniWrapper and arrowSchema and serializeSchema method by broadcast - val jniWrapper = NativeColumnarToRowJniWrapper.create() - val c2rId = jniWrapper.nativeColumnarToRowInit( - NativeMemoryManagers.contextInstance("ColumnarToRow").getNativeInstanceHandle) + val jniWrapper = NativeColumnarToRowJniWrapper.create(runtime) + val c2rId = jniWrapper.nativeColumnarToRowInit() val res: Iterator[Iterator[InternalRow]] = new Iterator[Iterator[InternalRow]] { @@ -159,7 +148,7 @@ object VeloxColumnarToRowExec { val beforeConvert = System.currentTimeMillis() val batchHandle = ColumnarBatches.getNativeHandle(batch) val info = - jniWrapper.nativeColumnarToRowConvert(batchHandle, c2rId) + jniWrapper.nativeColumnarToRowConvert(c2rId, batchHandle) convertTime += (System.currentTimeMillis() - beforeConvert) diff --git a/backends-velox/src/main/scala/org/apache/gluten/expression/DummyExpression.scala b/backends-velox/src/main/scala/org/apache/gluten/expression/DummyExpression.scala new file mode 100644 index 0000000000000..e2af66b599d3d --- /dev/null +++ b/backends-velox/src/main/scala/org/apache/gluten/expression/DummyExpression.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.expression + +import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow} +import org.apache.spark.sql.catalyst.analysis.FunctionRegistry +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo, UnaryExpression} +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} +import org.apache.spark.sql.types.DataType + +abstract class DummyExpression(child: Expression) extends UnaryExpression with Serializable { + private val accessor: (InternalRow, Int) => Any = InternalRow.getAccessor(dataType, nullable) + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = + defineCodeGen(ctx, ev, c => c) + + override def dataType: DataType = child.dataType + + override def eval(input: InternalRow): Any = { + assert(input.numFields == 1, "The input row of DummyExpression should have only 1 field.") + accessor(input, 0) + } +} + +// Can be used as a wrapper to force the original expression to fall back, mocking the fallback behavior +// of a supported expression in Gluten that fails native validation. 
+case class VeloxDummyExpression(child: Expression) + extends DummyExpression(child) + with Transformable { + override def getTransformer( + childrenTransformers: Seq[ExpressionTransformer]): ExpressionTransformer = { + if (childrenTransformers.size != children.size) { + throw new IllegalStateException( + this.getClass.getSimpleName + + ": getTransformer called before children transformer initialized.") + } + + GenericExpressionTransformer( + VeloxDummyExpression.VELOX_DUMMY_EXPRESSION, + childrenTransformers, + this) + } + + override protected def withNewChildInternal(newChild: Expression): Expression = copy(newChild) +} + +object VeloxDummyExpression { + val VELOX_DUMMY_EXPRESSION = "velox_dummy_expression" + + private val identifier = new FunctionIdentifier(VELOX_DUMMY_EXPRESSION) + + def registerFunctions(registry: FunctionRegistry): Unit = { + registry.registerFunction( + identifier, + new ExpressionInfo(classOf[VeloxDummyExpression].getName, VELOX_DUMMY_EXPRESSION), + (e: Seq[Expression]) => VeloxDummyExpression(e.head) + ) + } + + def unregisterFunctions(registry: FunctionRegistry): Unit = { + registry.dropFunction(identifier) + } +} diff --git a/backends-velox/src/main/scala/org/apache/gluten/expression/ExpressionTransformer.scala b/backends-velox/src/main/scala/org/apache/gluten/expression/ExpressionTransformer.scala index 0f0eb2969f7e1..51b19ab140d9a 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/expression/ExpressionTransformer.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/expression/ExpressionTransformer.scala @@ -19,6 +19,7 @@ package org.apache.gluten.expression import org.apache.gluten.exception.GlutenNotSupportException import org.apache.gluten.expression.ConverterUtils.FunctionConfig import org.apache.gluten.expression.ExpressionConverter.replaceWithExpressionTransformer +import org.apache.gluten.substrait.`type`.StructNode import org.apache.gluten.substrait.expression._ import org.apache.spark.sql.catalyst.expressions._ @@ -63,6 +64,10 @@ case class VeloxGetStructFieldTransformer( case node: SelectionNode => // Append the nested index to selection node. 
node.addNestedChildIdx(JInteger.valueOf(original.ordinal)) + case node: NullLiteralNode => + val nodeType = + node.getTypeNode.asInstanceOf[StructNode].getFieldTypes.get(original.ordinal) + ExpressionBuilder.makeNullLiteral(nodeType) case other => throw new GlutenNotSupportException(s"$other is not supported.") } diff --git a/backends-velox/src/main/scala/org/apache/gluten/extension/ArrowScanReplaceRule.scala b/backends-velox/src/main/scala/org/apache/gluten/extension/ArrowScanReplaceRule.scala index adfc6ca742c93..dba8df5cf1a1f 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/extension/ArrowScanReplaceRule.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/extension/ArrowScanReplaceRule.scala @@ -19,6 +19,7 @@ package org.apache.gluten.extension import org.apache.gluten.datasource.ArrowCSVFileFormat import org.apache.gluten.datasource.v2.ArrowCSVScan import org.apache.gluten.execution.datasource.v2.ArrowBatchScanExec +import org.apache.gluten.utils.PhysicalPlanSelector import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.rules.Rule @@ -26,7 +27,7 @@ import org.apache.spark.sql.execution.{ArrowFileSourceScanExec, FileSourceScanEx import org.apache.spark.sql.execution.datasources.v2.BatchScanExec case class ArrowScanReplaceRule(spark: SparkSession) extends Rule[SparkPlan] { - override def apply(plan: SparkPlan): SparkPlan = { + override def apply(plan: SparkPlan): SparkPlan = PhysicalPlanSelector.maybe(spark, plan) { plan.transformUp { case plan: FileSourceScanExec if plan.relation.fileFormat.isInstanceOf[ArrowCSVFileFormat] => ArrowFileSourceScanExec(plan) diff --git a/backends-velox/src/main/scala/org/apache/gluten/extension/FlushableHashAggregateRule.scala b/backends-velox/src/main/scala/org/apache/gluten/extension/FlushableHashAggregateRule.scala index f850b6f457ea0..777bf553856ed 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/extension/FlushableHashAggregateRule.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/extension/FlushableHashAggregateRule.scala @@ -16,7 +16,7 @@ */ package org.apache.gluten.extension -import org.apache.gluten.execution.{FlushableHashAggregateExecTransformer, HashAggregateExecTransformer, ProjectExecTransformer, RegularHashAggregateExecTransformer} +import org.apache.gluten.execution._ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.aggregate.{Partial, PartialMerge} @@ -30,74 +30,55 @@ import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike * optimizations such as flushing and abandoning. */ case class FlushableHashAggregateRule(session: SparkSession) extends Rule[SparkPlan] { + import FlushableHashAggregateRule._ override def apply(plan: SparkPlan): SparkPlan = plan.transformUp { - case shuffle: ShuffleExchangeLike => + case s: ShuffleExchangeLike => // If an exchange follows a hash aggregate in which all functions are in partial mode, // then it's safe to convert the hash aggregate to flushable hash aggregate. 
- shuffle.child match { - case HashAggPropagatedToShuffle(proj, agg) => - shuffle.withNewChildren( - Seq(proj.withNewChildren(Seq(FlushableHashAggregateExecTransformer( - agg.requiredChildDistributionExpressions, - agg.groupingExpressions, - agg.aggregateExpressions, - agg.aggregateAttributes, - agg.initialInputBufferOffset, - agg.resultExpressions, - agg.child - ))))) - case HashAggWithShuffle(agg) => - shuffle.withNewChildren( - Seq(FlushableHashAggregateExecTransformer( - agg.requiredChildDistributionExpressions, - agg.groupingExpressions, - agg.aggregateExpressions, - agg.aggregateAttributes, - agg.initialInputBufferOffset, - agg.resultExpressions, - agg.child - ))) - case _ => - shuffle - } + val out = s.withNewChildren( + List( + replaceEligibleAggregates(s.child) { + agg => + FlushableHashAggregateExecTransformer( + agg.requiredChildDistributionExpressions, + agg.groupingExpressions, + agg.aggregateExpressions, + agg.aggregateAttributes, + agg.initialInputBufferOffset, + agg.resultExpressions, + agg.child + ) + } + ) + ) + out } -} -object HashAggPropagatedToShuffle { - def unapply( - plan: SparkPlan): Option[(ProjectExecTransformer, RegularHashAggregateExecTransformer)] = { - if (!plan.isInstanceOf[ProjectExecTransformer]) { - return None - } - val proj = plan.asInstanceOf[ProjectExecTransformer] - val child = proj.child - if (!child.isInstanceOf[RegularHashAggregateExecTransformer]) { - return None - } - val agg = child.asInstanceOf[RegularHashAggregateExecTransformer] - if (!agg.aggregateExpressions.forall(p => p.mode == Partial || p.mode == PartialMerge)) { - return None + private def replaceEligibleAggregates(plan: SparkPlan)( + func: RegularHashAggregateExecTransformer => SparkPlan): SparkPlan = { + def transformDown: SparkPlan => SparkPlan = { + case agg: RegularHashAggregateExecTransformer + if !agg.aggregateExpressions.forall(p => p.mode == Partial || p.mode == PartialMerge) => + // Not an intermediate agg. Skip. + agg + case agg: RegularHashAggregateExecTransformer + if isAggInputAlreadyDistributedWithAggKeys(agg) => + // Data already grouped by aggregate keys. Skip. + agg + case agg: RegularHashAggregateExecTransformer => + func(agg) + case p if !canPropagate(p) => p + case other => other.withNewChildren(other.children.map(transformDown)) } - if (FlushableHashAggregateRule.isAggInputAlreadyDistributedWithAggKeys(agg)) { - return None - } - Some((proj, agg)) + + val out = transformDown(plan) + out } -} -object HashAggWithShuffle { - def unapply(plan: SparkPlan): Option[RegularHashAggregateExecTransformer] = { - if (!plan.isInstanceOf[RegularHashAggregateExecTransformer]) { - return None - } - val agg = plan.asInstanceOf[RegularHashAggregateExecTransformer] - if (!agg.aggregateExpressions.forall(p => p.mode == Partial || p.mode == PartialMerge)) { - return None - } - if (FlushableHashAggregateRule.isAggInputAlreadyDistributedWithAggKeys(agg)) { - return None - } - Some(agg) + private def canPropagate(plan: SparkPlan): Boolean = plan match { + case _: ProjectExecTransformer => true + case _: VeloxAppendBatchesExec => true + case _ => false } } @@ -112,7 +93,8 @@ object FlushableHashAggregateRule { * only on a single partition among the whole cluster. Spark's planner may use this information to * perform optimizations like doing "partial_count(a, b, c)" directly on the output data. 
*/ - def isAggInputAlreadyDistributedWithAggKeys(agg: HashAggregateExecTransformer): Boolean = { + private def isAggInputAlreadyDistributedWithAggKeys( + agg: HashAggregateExecTransformer): Boolean = { if (agg.groupingExpressions.isEmpty) { // Empty grouping set () should not be satisfied by any partitioning patterns. // E.g., diff --git a/backends-velox/src/main/scala/org/apache/gluten/extension/HLLRewriteRule.scala b/backends-velox/src/main/scala/org/apache/gluten/extension/HLLRewriteRule.scala index cb1e626a1ea62..7bae64ff8d592 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/extension/HLLRewriteRule.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/extension/HLLRewriteRule.scala @@ -35,7 +35,7 @@ case class HLLRewriteRule(spark: SparkSession) extends Rule[LogicalPlan] { case hllExpr @ AggregateExpression(hll: HyperLogLogPlusPlus, _, _, _, _) if GlutenConfig.getConf.enableNativeHyperLogLogAggregateFunction && GlutenConfig.getConf.enableColumnarHashAgg && - !hasDistinctAggregateFunc(a) && isDataTypeSupported(hll.child.dataType) => + isDataTypeSupported(hll.child.dataType) => AggregateExpression( HLLAdapter( hll.child, @@ -51,28 +51,15 @@ case class HLLRewriteRule(spark: SparkSession) extends Rule[LogicalPlan] { } } - private def hasDistinctAggregateFunc(agg: Aggregate): Boolean = { - agg.aggregateExpressions - .flatMap(_.collect { case ae: AggregateExpression => ae }) - .exists(_.isDistinct) - } - private def isDataTypeSupported(dataType: DataType): Boolean = { // HLL in velox only supports below data types. we should not offload HLL to velox, if // child's data type is not supported. This prevents the case only partail agg is fallbacked. // As spark and velox have different HLL binary formats, HLL binary generated by spark can't // be parsed by velox, it would cause the error: 'Unexpected type of HLL'. 
dataType match { - case BooleanType => true - case ByteType => true - case _: CharType => true - case DateType => true - case DoubleType => true - case FloatType => true - case IntegerType => true - case LongType => true - case ShortType => true - case StringType => true + case BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType | + StringType | _: CharType | _: DecimalType | DateType => + true case _ => false } } diff --git a/backends-velox/src/main/scala/org/apache/gluten/utils/DatasourceUtil.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/DatasourceUtil.scala index 6150507b4baa7..3d7725655bb12 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/utils/DatasourceUtil.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/utils/DatasourceUtil.scala @@ -17,8 +17,8 @@ package org.apache.gluten.utils import org.apache.gluten.datasource.DatasourceJniWrapper +import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators -import org.apache.gluten.memory.nmm.NativeMemoryManagers import org.apache.spark.sql.types.StructType import org.apache.spark.sql.utils.SparkSchemaUtil @@ -38,13 +38,12 @@ object DatasourceUtil { def readSchema(file: FileStatus): Option[StructType] = { val allocator = ArrowBufferAllocators.contextInstance() - val datasourceJniWrapper = DatasourceJniWrapper.create() + val runtime = Runtimes.contextInstance("VeloxWriter") + val datasourceJniWrapper = DatasourceJniWrapper.create(runtime) val dsHandle = datasourceJniWrapper.nativeInitDatasource( file.getPath.toString, -1, - NativeMemoryManagers.contextInstance("VeloxWriter").getNativeInstanceHandle, - new util.HashMap[String, String]() - ) + new util.HashMap[String, String]()) val cSchema = ArrowSchema.allocateNew(allocator) datasourceJniWrapper.inspectSchema(dsHandle, cSchema.memoryAddress()) try { diff --git a/backends-velox/src/main/scala/org/apache/spark/api/python/ColumnarArrowEvalPythonExec.scala b/backends-velox/src/main/scala/org/apache/spark/api/python/ColumnarArrowEvalPythonExec.scala index d5639057dac8d..88280ff2eddee 100644 --- a/backends-velox/src/main/scala/org/apache/spark/api/python/ColumnarArrowEvalPythonExec.scala +++ b/backends-velox/src/main/scala/org/apache/spark/api/python/ColumnarArrowEvalPythonExec.scala @@ -20,7 +20,8 @@ import org.apache.gluten.columnarbatch.ColumnarBatches import org.apache.gluten.exception.GlutenException import org.apache.gluten.extension.GlutenPlan import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators -import org.apache.gluten.utils.{Iterators, PullOutProjectHelper} +import org.apache.gluten.utils.PullOutProjectHelper +import org.apache.gluten.utils.iterator.Iterators import org.apache.gluten.vectorized.ArrowWritableColumnVector import org.apache.spark.{ContextAwareIterator, SparkEnv, TaskContext} diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/execution/BroadcastUtils.scala b/backends-velox/src/main/scala/org/apache/spark/sql/execution/BroadcastUtils.scala index 6346251b6bcda..3d532133a0537 100644 --- a/backends-velox/src/main/scala/org/apache/spark/sql/execution/BroadcastUtils.scala +++ b/backends-velox/src/main/scala/org/apache/spark/sql/execution/BroadcastUtils.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution import org.apache.gluten.columnarbatch.ColumnarBatches -import org.apache.gluten.memory.nmm.NativeMemoryManagers +import org.apache.gluten.exec.Runtimes import org.apache.gluten.sql.shims.SparkShimLoader import 
org.apache.gluten.vectorized.{ColumnarBatchSerializeResult, ColumnarBatchSerializerJniWrapper} @@ -31,7 +31,6 @@ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.TaskResources -import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer; // Utility methods to convert Vanilla broadcast relations from/to Velox broadcast relations. @@ -153,17 +152,12 @@ object BroadcastUtils { if (filtered.isEmpty) { return ColumnarBatchSerializeResult.EMPTY } - val batchRuntime = ColumnarBatches.getRuntime(filtered.toList.asJava) val handleArray = filtered.map(ColumnarBatches.getNativeHandle) val serializeResult = try { ColumnarBatchSerializerJniWrapper - .forRuntime(batchRuntime) - .serialize( - handleArray, - NativeMemoryManagers - .contextInstance("BroadcastRelation") - .getNativeInstanceHandle) + .create(Runtimes.contextInstance("BroadcastUtils#serializeStream")) + .serialize(handleArray) } finally { filtered.foreach(ColumnarBatches.release) } diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/execution/ColumnarCachedBatchSerializer.scala b/backends-velox/src/main/scala/org/apache/spark/sql/execution/ColumnarCachedBatchSerializer.scala index 7385c53d61b3c..15fd51abef489 100644 --- a/backends-velox/src/main/scala/org/apache/spark/sql/execution/ColumnarCachedBatchSerializer.scala +++ b/backends-velox/src/main/scala/org/apache/spark/sql/execution/ColumnarCachedBatchSerializer.scala @@ -22,8 +22,8 @@ import org.apache.gluten.columnarbatch.ColumnarBatches import org.apache.gluten.exec.Runtimes import org.apache.gluten.execution.{RowToVeloxColumnarExec, VeloxColumnarToRowExec} import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators -import org.apache.gluten.memory.nmm.NativeMemoryManagers -import org.apache.gluten.utils.{ArrowAbiUtil, Iterators} +import org.apache.gluten.utils.ArrowAbiUtil +import org.apache.gluten.utils.iterator.Iterators import org.apache.gluten.vectorized.ColumnarBatchSerializerJniWrapper import org.apache.spark.internal.Logging @@ -149,8 +149,7 @@ class ColumnarCachedBatchSerializer extends CachedBatchSerializer with SQLConfHe numInputRows, numOutputBatches, convertTime, - numRows - ) + numRows) } convertColumnarBatchToCachedBatch(rddColumnarBatch, schema, storageLevel, conf) } @@ -185,8 +184,7 @@ class ColumnarCachedBatchSerializer extends CachedBatchSerializer with SQLConfHe selectedAttributes, numOutputRows, numInputBatches, - convertTime - ) + convertTime) } } @@ -197,10 +195,6 @@ class ColumnarCachedBatchSerializer extends CachedBatchSerializer with SQLConfHe conf: SQLConf): RDD[CachedBatch] = { input.mapPartitions { it => - val nativeMemoryManagerHandle = NativeMemoryManagers - .contextInstance("ColumnarCachedBatchSerializer serialize") - .getNativeInstanceHandle - new Iterator[CachedBatch] { override def hasNext: Boolean = it.hasNext @@ -208,11 +202,8 @@ class ColumnarCachedBatchSerializer extends CachedBatchSerializer with SQLConfHe val batch = it.next() val results = ColumnarBatchSerializerJniWrapper - .create() - .serialize( - Array(ColumnarBatches.getNativeHandle(batch)), - nativeMemoryManagerHandle - ) + .create(Runtimes.contextInstance("ColumnarCachedBatchSerializer#serialize")) + .serialize(Array(ColumnarBatches.getNativeHandle(batch))) CachedColumnarBatch( results.getNumRows.toInt, results.getSerialized.length, @@ -236,19 +227,15 @@ class ColumnarCachedBatchSerializer extends CachedBatchSerializer with SQLConfHe val timezoneId = 
SQLConf.get.sessionLocalTimeZone input.mapPartitions { it => + val runtime = Runtimes.contextInstance("ColumnarCachedBatchSerializer#read") val jniWrapper = ColumnarBatchSerializerJniWrapper - .create() - val nmm = NativeMemoryManagers - .contextInstance("ColumnarCachedBatchSerializer read") + .create(runtime) val schema = SparkArrowUtil.toArrowSchema(localSchema, timezoneId) val arrowAlloc = ArrowBufferAllocators.contextInstance() val cSchema = ArrowSchema.allocateNew(arrowAlloc) ArrowAbiUtil.exportSchema(arrowAlloc, schema, cSchema) val deserializerHandle = jniWrapper - .init( - cSchema.memoryAddress(), - nmm.getNativeInstanceHandle - ) + .init(cSchema.memoryAddress()) cSchema.close() Iterators @@ -260,10 +247,10 @@ class ColumnarCachedBatchSerializer extends CachedBatchSerializer with SQLConfHe val batchHandle = jniWrapper .deserialize(deserializerHandle, cachedBatch.bytes) - val batch = ColumnarBatches.create(Runtimes.contextInstance(), batchHandle) + val batch = ColumnarBatches.create(batchHandle) if (shouldSelectAttributes) { try { - ColumnarBatches.select(nmm, batch, requestedColumnIndices.toArray) + ColumnarBatches.select(batch, requestedColumnIndices.toArray) } finally { batch.close() } diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/execution/VeloxColumnarWriteFilesExec.scala b/backends-velox/src/main/scala/org/apache/spark/sql/execution/VeloxColumnarWriteFilesExec.scala index 1d3d55afb5262..c87b8d4f688d7 100644 --- a/backends-velox/src/main/scala/org/apache/spark/sql/execution/VeloxColumnarWriteFilesExec.scala +++ b/backends-velox/src/main/scala/org/apache/spark/sql/execution/VeloxColumnarWriteFilesExec.scala @@ -24,8 +24,8 @@ import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.spark.{Partition, SparkException, TaskContext, TaskOutputFileAlreadyExistException} +import org.apache.spark.internal.io.{FileCommitProtocol, SparkHadoopWriterUtils} import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage -import org.apache.spark.internal.io.SparkHadoopWriterUtils import org.apache.spark.rdd.RDD import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.sql.catalyst.InternalRow @@ -90,7 +90,8 @@ case class VeloxWriteFilesMetrics( */ class VeloxColumnarWriteFilesRDD( var prev: RDD[ColumnarBatch], - writeFilesSpec: WriteFilesSpec, + description: WriteJobDescription, + committer: FileCommitProtocol, jobTrackerID: String) extends RDD[WriterCommitMessage](prev) { @@ -118,7 +119,7 @@ class VeloxColumnarWriteFilesRDD( val fileWriteInfo = fileWriteInfos.head numBytes += fileWriteInfo.fileSize val targetFileName = fileWriteInfo.targetFileName - val outputPath = writeFilesSpec.description.path + val outputPath = description.path // part1=1/part2=1 val partitionFragment = metrics.name @@ -126,7 +127,7 @@ class VeloxColumnarWriteFilesRDD( if (partitionFragment != "") { updatedPartitions += partitionFragment val tmpOutputPath = outputPath + "/" + partitionFragment + "/" + targetFileName - val customOutputPath = writeFilesSpec.description.customPartitionLocations.get( + val customOutputPath = description.customPartitionLocations.get( PartitioningUtils.parsePathFragment(partitionFragment)) if (customOutputPath.isDefined) { addedAbsPathFiles(tmpOutputPath) = customOutputPath.get + "/" + targetFileName @@ -174,8 +175,6 @@ class VeloxColumnarWriteFilesRDD( private def writeFilesForEmptyIterator( commitProtocol: SparkWriteFilesCommitProtocol): WriteTaskResult = { - val 
description = writeFilesSpec.description - val committer = writeFilesSpec.committer val taskAttemptContext = commitProtocol.taskAttemptContext val dataWriter = @@ -194,10 +193,7 @@ class VeloxColumnarWriteFilesRDD( } override def compute(split: Partition, context: TaskContext): Iterator[WriterCommitMessage] = { - val commitProtocol = new SparkWriteFilesCommitProtocol( - jobTrackerID, - writeFilesSpec.description, - writeFilesSpec.committer) + val commitProtocol = new SparkWriteFilesCommitProtocol(jobTrackerID, description, committer) commitProtocol.setupTask() val writePath = commitProtocol.newTaskAttemptTempPath() @@ -238,7 +234,7 @@ class VeloxColumnarWriteFilesRDD( case t: Throwable => throw new SparkException( s"Task failed while writing rows to staging path: $writePath, " + - s"output path: ${writeFilesSpec.description.path}", + s"output path: ${description.path}", t) } @@ -285,10 +281,9 @@ case class VeloxColumnarWriteFilesExec private ( /** Fallback to use vanilla Spark write files to generate an empty file for metadata only. */ private def writeFilesForEmptyRDD( - writeFilesSpec: WriteFilesSpec, + description: WriteJobDescription, + committer: FileCommitProtocol, jobTrackerID: String): RDD[WriterCommitMessage] = { - val description = writeFilesSpec.description - val committer = writeFilesSpec.committer val rddWithNonEmptyPartitions = session.sparkContext.parallelize(Seq.empty[InternalRow], 1) rddWithNonEmptyPartitions.mapPartitionsInternal { iterator => @@ -314,12 +309,14 @@ case class VeloxColumnarWriteFilesExec private ( val rdd = child.executeColumnar() val jobTrackerID = SparkHadoopWriterUtils.createJobTrackerID(new Date()) + val description = writeFilesSpec.description + val committer = writeFilesSpec.committer if (rdd.partitions.length == 0) { // SPARK-23271 If we are attempting to write a zero partition rdd, create a dummy single // partition rdd to make sure we at least set up one write task to write the metadata. 
- writeFilesForEmptyRDD(writeFilesSpec, jobTrackerID) + writeFilesForEmptyRDD(description, committer, jobTrackerID) } else { - new VeloxColumnarWriteFilesRDD(rdd, writeFilesSpec, jobTrackerID) + new VeloxColumnarWriteFilesRDD(rdd, description, committer, jobTrackerID) } } override protected def withNewChildrenInternal( diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/VeloxWriteQueue.scala b/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/VeloxWriteQueue.scala index 089db1da1dee8..b2905e157554f 100644 --- a/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/VeloxWriteQueue.scala +++ b/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/VeloxWriteQueue.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources import org.apache.gluten.datasource.DatasourceJniWrapper -import org.apache.gluten.utils.Iterators +import org.apache.gluten.utils.iterator.Iterators import org.apache.gluten.vectorized.ColumnarBatchInIterator import org.apache.spark.TaskContext diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxBlockStripes.java b/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxBlockStripes.java index 4d887e60794a5..f9848d4ab6344 100644 --- a/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxBlockStripes.java +++ b/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxBlockStripes.java @@ -16,13 +16,11 @@ */ package org.apache.spark.sql.execution.datasources.velox; -import org.apache.gluten.exec.Runtimes; +import org.apache.gluten.columnarbatch.ColumnarBatches; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.UnsafeRow; import org.apache.spark.sql.execution.datasources.BlockStripe; import org.apache.spark.sql.execution.datasources.BlockStripes; -import org.apache.gluten.columnarbatch.ColumnarBatches; - import org.apache.spark.sql.vectorized.ColumnarBatch; import org.jetbrains.annotations.NotNull; @@ -53,7 +51,7 @@ public BlockStripe next() { return new BlockStripe() { @Override public ColumnarBatch getColumnarBatch() { - return ColumnarBatches.create(Runtimes.contextInstance(), blockAddresses[0]); + return ColumnarBatches.create(blockAddresses[0]); } @Override diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxFormatWriterInjects.scala b/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxFormatWriterInjects.scala index c358d6372c368..ebbf959d0b1f9 100644 --- a/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxFormatWriterInjects.scala +++ b/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxFormatWriterInjects.scala @@ -19,9 +19,9 @@ package org.apache.spark.sql.execution.datasources.velox import org.apache.gluten.columnarbatch.ColumnarBatches import org.apache.gluten.datasource.DatasourceJniWrapper import org.apache.gluten.exception.GlutenException +import org.apache.gluten.exec.Runtimes import org.apache.gluten.execution.datasource.GlutenRowSplitter import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators -import org.apache.gluten.memory.nmm.NativeMemoryManagers import org.apache.gluten.utils.{ArrowAbiUtil, DatasourceUtil} import org.apache.spark.sql.SparkSession @@ -34,7 +34,7 @@ import org.apache.spark.util.TaskResources import 
com.google.common.base.Preconditions import org.apache.arrow.c.ArrowSchema -import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} +import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.mapreduce.TaskAttemptContext import java.io.IOException @@ -48,8 +48,8 @@ trait VeloxFormatWriterInjects extends GlutenFormatWriterInjectsBase { // Create the hdfs path if not existed. val hdfsSchema = "hdfs://" if (filePath.startsWith(hdfsSchema)) { - val fs = FileSystem.get(context.getConfiguration) val hdfsPath = new Path(filePath) + val fs = hdfsPath.getFileSystem(context.getConfiguration) if (!fs.exists(hdfsPath.getParent)) { fs.mkdirs(hdfsPath.getParent) } @@ -59,15 +59,13 @@ trait VeloxFormatWriterInjects extends GlutenFormatWriterInjectsBase { SparkArrowUtil.toArrowSchema(dataSchema, SQLConf.get.sessionLocalTimeZone) val cSchema = ArrowSchema.allocateNew(ArrowBufferAllocators.contextInstance()) var dsHandle = -1L - val datasourceJniWrapper = DatasourceJniWrapper.create() + val runtime = Runtimes.contextInstance("VeloxWriter") + val datasourceJniWrapper = DatasourceJniWrapper.create(runtime) val allocator = ArrowBufferAllocators.contextInstance() try { ArrowAbiUtil.exportSchema(allocator, arrowSchema, cSchema) - dsHandle = datasourceJniWrapper.nativeInitDatasource( - filePath, - cSchema.memoryAddress(), - NativeMemoryManagers.contextInstance("VeloxWriter").getNativeInstanceHandle, - nativeConf) + dsHandle = + datasourceJniWrapper.nativeInitDatasource(filePath, cSchema.memoryAddress(), nativeConf) } catch { case e: IOException => throw new GlutenException(e) @@ -119,16 +117,12 @@ class VeloxRowSplitter extends GlutenRowSplitter { hasBucket: Boolean, reserve_partition_columns: Boolean = false): BlockStripes = { val handler = ColumnarBatches.getNativeHandle(row.batch) - val datasourceJniWrapper = DatasourceJniWrapper.create() + val runtime = Runtimes.contextInstance("VeloxPartitionWriter") + val datasourceJniWrapper = DatasourceJniWrapper.create(runtime) val originalColumns: Array[Int] = Array.range(0, row.batch.numCols()) val dataColIndice = originalColumns.filterNot(partitionColIndice.contains(_)) new VeloxBlockStripes( datasourceJniWrapper - .splitBlockByPartitionAndBucket( - handler, - dataColIndice, - hasBucket, - NativeMemoryManagers.contextInstance("VeloxPartitionWriter").getNativeInstanceHandle) - ) + .splitBlockByPartitionAndBucket(handler, dataColIndice, hasBucket)) } } diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/expression/UDFResolver.scala b/backends-velox/src/main/scala/org/apache/spark/sql/expression/UDFResolver.scala index ec98e98f1c6e7..99f9faf9914a0 100644 --- a/backends-velox/src/main/scala/org/apache/spark/sql/expression/UDFResolver.scala +++ b/backends-velox/src/main/scala/org/apache/spark/sql/expression/UDFResolver.scala @@ -27,7 +27,7 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow} import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, ExpressionInfo} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, ExpressionInfo, Unevaluable} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.types.DataTypeUtils @@ -94,7 +94,8 @@ case class UDFExpression( 
dataType: DataType, nullable: Boolean, children: Seq[Expression]) - extends Transformable { + extends Unevaluable + with Transformable { override protected def withNewChildrenInternal( newChildren: IndexedSeq[Expression]): Expression = { this.copy(children = newChildren) @@ -175,12 +176,16 @@ object UDFResolver extends Logging { intermediateTypes: ExpressionType, variableArity: Boolean): Unit = { assert(argTypes.dataType.isInstanceOf[StructType]) - assert(intermediateTypes.dataType.isInstanceOf[StructType]) - val aggBufferAttributes = - intermediateTypes.dataType.asInstanceOf[StructType].fields.zipWithIndex.map { - case (f, index) => - AttributeReference(s"inter_$index", f.dataType, f.nullable)() + val aggBufferAttributes: Seq[AttributeReference] = + intermediateTypes.dataType match { + case StructType(fields) => + fields.zipWithIndex.map { + case (f, index) => + AttributeReference(s"agg_inter_$index", f.dataType, f.nullable)() + } + case t => + Seq(AttributeReference(s"agg_inter", t)()) } val v = @@ -226,8 +231,9 @@ object UDFResolver extends Logging { udfLibPaths match { case Some(paths) => + // Set resolved paths to the internal config to parse on native side. sparkConf.set( - VeloxBackendSettings.GLUTEN_VELOX_UDF_LIB_PATHS, + VeloxBackendSettings.GLUTEN_VELOX_INTERNAL_UDF_LIB_PATHS, getAllLibraries(sparkConf, isDriver, paths)) case None => } @@ -322,7 +328,7 @@ object UDFResolver extends Logging { case None => Seq.empty case Some(_) => - new UdfJniWrapper().getFunctionSignatures() + UdfJniWrapper.getFunctionSignatures() UDFNames.map { name => diff --git a/backends-velox/src/test/java/org/apache/gluten/columnarbatch/ColumnarBatchTest.java b/backends-velox/src/test/java/org/apache/gluten/columnarbatch/ColumnarBatchTest.java new file mode 100644 index 0000000000000..cd2ac50d350c3 --- /dev/null +++ b/backends-velox/src/test/java/org/apache/gluten/columnarbatch/ColumnarBatchTest.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.columnarbatch; + +import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators; +import org.apache.gluten.test.VeloxBackendTestBase; +import org.apache.gluten.vectorized.ArrowWritableColumnVector; + +import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.vectorized.ColumnarBatch; +import org.apache.spark.util.TaskResources$; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.StreamSupport; + +public class ColumnarBatchTest extends VeloxBackendTestBase { + + @Test + public void testOffloadAndLoad() { + TaskResources$.MODULE$.runUnsafe( + () -> { + final int numRows = 100; + final ColumnarBatch batch = newArrowBatch("a boolean, b int", numRows); + Assert.assertTrue(ColumnarBatches.isHeavyBatch(batch)); + final ColumnarBatch offloaded = + ColumnarBatches.ensureOffloaded(ArrowBufferAllocators.contextInstance(), batch); + Assert.assertTrue(ColumnarBatches.isLightBatch(offloaded)); + final ColumnarBatch loaded = + ColumnarBatches.ensureLoaded(ArrowBufferAllocators.contextInstance(), offloaded); + Assert.assertTrue(ColumnarBatches.isHeavyBatch(loaded)); + long cnt = + StreamSupport.stream( + Spliterators.spliteratorUnknownSize( + loaded.rowIterator(), Spliterator.ORDERED), + false) + .count(); + Assert.assertEquals(numRows, cnt); + loaded.close(); + return null; + }); + } + + @Test + public void testCreateByHandle() { + TaskResources$.MODULE$.runUnsafe( + () -> { + final int numRows = 100; + final ColumnarBatch batch = newArrowBatch("a boolean, b int", numRows); + Assert.assertEquals(1, ColumnarBatches.getRefCnt(batch)); + final ColumnarBatch offloaded = + ColumnarBatches.ensureOffloaded(ArrowBufferAllocators.contextInstance(), batch); + Assert.assertEquals(1, ColumnarBatches.getRefCnt(offloaded)); + final long handle = ColumnarBatches.getNativeHandle(offloaded); + final ColumnarBatch created = ColumnarBatches.create(handle); + Assert.assertEquals(handle, ColumnarBatches.getNativeHandle(created)); + Assert.assertEquals(1, ColumnarBatches.getRefCnt(offloaded)); + Assert.assertEquals(1, ColumnarBatches.getRefCnt(created)); + ColumnarBatches.retain(created); + Assert.assertEquals(2, ColumnarBatches.getRefCnt(offloaded)); + Assert.assertEquals(2, ColumnarBatches.getRefCnt(created)); + ColumnarBatches.retain(offloaded); + Assert.assertEquals(3, ColumnarBatches.getRefCnt(offloaded)); + Assert.assertEquals(3, ColumnarBatches.getRefCnt(created)); + created.close(); + Assert.assertEquals(2, ColumnarBatches.getRefCnt(offloaded)); + Assert.assertEquals(2, ColumnarBatches.getRefCnt(created)); + offloaded.close(); + Assert.assertEquals(1, ColumnarBatches.getRefCnt(offloaded)); + Assert.assertEquals(1, ColumnarBatches.getRefCnt(created)); + created.close(); + Assert.assertEquals(0, ColumnarBatches.getRefCnt(offloaded)); + Assert.assertEquals(0, ColumnarBatches.getRefCnt(created)); + return null; + }); + } + + private static ColumnarBatch newArrowBatch(String schema, int numRows) { + final ArrowWritableColumnVector[] columns = + ArrowWritableColumnVector.allocateColumns(numRows, StructType.fromDDL(schema)); + for (ArrowWritableColumnVector col : columns) { + col.setValueCount(numRows); + } + final ColumnarBatch batch = new ColumnarBatch(columns); + batch.setNumRows(numRows); + return batch; + } +} diff --git a/backends-velox/src/test/java/org/apache/gluten/test/VeloxBackendTestBase.java b/backends-velox/src/test/java/org/apache/gluten/test/VeloxBackendTestBase.java 
new file mode 100644 index 0000000000000..1d7df23566dfb --- /dev/null +++ b/backends-velox/src/test/java/org/apache/gluten/test/VeloxBackendTestBase.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.test; + +import org.apache.gluten.GlutenConfig; +import org.apache.gluten.backendsapi.ListenerApi; +import org.apache.gluten.backendsapi.velox.VeloxListenerApi; + +import com.codahale.metrics.MetricRegistry; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; +import org.apache.spark.api.plugin.PluginContext; +import org.apache.spark.resource.ResourceInformation; +import org.junit.BeforeClass; + +import java.io.IOException; +import java.util.Map; + +/** For testing Velox backend without starting a Spark context. */ +public abstract class VeloxBackendTestBase { + @BeforeClass + public static void setup() { + final ListenerApi api = new VeloxListenerApi(); + api.onDriverStart(mockSparkContext(), mockPluginContext()); + } + + private static SparkContext mockSparkContext() { + // Not yet implemented. 
+ return null; + } + + private static PluginContext mockPluginContext() { + return new PluginContext() { + @Override + public MetricRegistry metricRegistry() { + throw new UnsupportedOperationException(); + } + + @Override + public SparkConf conf() { + final SparkConf conf = new SparkConf(); + conf.set(GlutenConfig.GLUTEN_NUM_TASK_SLOTS_PER_EXECUTOR_KEY(), "0"); + return conf; + } + + @Override + public String executorID() { + throw new UnsupportedOperationException(); + } + + @Override + public String hostname() { + throw new UnsupportedOperationException(); + } + + @Override + public Map resources() { + throw new UnsupportedOperationException(); + } + + @Override + public void send(Object message) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public Object ask(Object message) throws Exception { + throw new UnsupportedOperationException(); + } + }; + } +} diff --git a/backends-velox/src/test/java/org/apache/gluten/utils/VeloxBloomFilterTest.java b/backends-velox/src/test/java/org/apache/gluten/utils/VeloxBloomFilterTest.java index db54bd7832dae..cf568b1665821 100644 --- a/backends-velox/src/test/java/org/apache/gluten/utils/VeloxBloomFilterTest.java +++ b/backends-velox/src/test/java/org/apache/gluten/utils/VeloxBloomFilterTest.java @@ -16,28 +16,18 @@ */ package org.apache.gluten.utils; -import org.apache.gluten.backendsapi.ListenerApi; -import org.apache.gluten.backendsapi.velox.VeloxListenerApi; +import org.apache.gluten.test.VeloxBackendTestBase; -import org.apache.spark.SparkConf; import org.apache.spark.util.TaskResources$; import org.apache.spark.util.sketch.BloomFilter; import org.apache.spark.util.sketch.IncompatibleMergeException; import org.junit.Assert; -import org.junit.BeforeClass; import org.junit.Test; import org.junit.function.ThrowingRunnable; import java.nio.ByteBuffer; -public class VeloxBloomFilterTest { - - @BeforeClass - public static void setup() { - final ListenerApi api = new VeloxListenerApi(); - api.onDriverStart(new SparkConf()); - } - +public class VeloxBloomFilterTest extends VeloxBackendTestBase { @Test public void testEmpty() { TaskResources$.MODULE$.runUnsafe( diff --git a/backends-velox/src/test/resources/datasource/csv/student_option.csv b/backends-velox/src/test/resources/datasource/csv/student_option.csv new file mode 100644 index 0000000000000..919b7387b53c0 --- /dev/null +++ b/backends-velox/src/test/resources/datasource/csv/student_option.csv @@ -0,0 +1,4 @@ +Name;Language +Juno;Java +Peter;Python +Celin;C++ diff --git a/backends-velox/src/test/resources/datasource/csv/student_option_schema.csv b/backends-velox/src/test/resources/datasource/csv/student_option_schema.csv new file mode 100644 index 0000000000000..be8459a217393 --- /dev/null +++ b/backends-velox/src/test/resources/datasource/csv/student_option_schema.csv @@ -0,0 +1,4 @@ +id,name,language +1,Juno,Java +2,Peter,Python +3,Celin,C++ diff --git a/backends-velox/src/test/resources/datasource/csv/student_option_str.csv b/backends-velox/src/test/resources/datasource/csv/student_option_str.csv new file mode 100644 index 0000000000000..b4214b390caea --- /dev/null +++ b/backends-velox/src/test/resources/datasource/csv/student_option_str.csv @@ -0,0 +1,4 @@ +Name,Language +Juno,Java +Peter,Python +,C++ diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/1.txt index 8d67aad16c3dc..53edb933c1fb4 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/1.txt @@ -1,31 +1,31 @@ == Physical Plan == -AdaptiveSparkPlan (28) +AdaptiveSparkPlan (30) +- == Final Plan == - VeloxColumnarToRowExec (19) - +- ^ SortExecTransformer (17) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (21) + +- ^ SortExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ ProjectExecTransformer (5) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ NoopFilter (2) +- ^ Scan parquet (1) +- == Initial Plan == - Sort (27) - +- Exchange (26) - +- HashAggregate (25) - +- Exchange (24) - +- HashAggregate (23) - +- Project (22) - +- Filter (21) - +- Scan parquet (20) + Sort (29) + +- Exchange (28) + +- HashAggregate (27) + +- Exchange (26) + +- HashAggregate (25) + +- Project (24) + +- Filter (23) + +- Scan parquet (22) (1) Scan parquet @@ -58,97 +58,105 @@ Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] +Arguments: X + +(8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), 
sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true)), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] +Arguments: X + +(15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, 
count_order#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(17) SortExecTransformer +(19) SortExecTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(18) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(19) VeloxColumnarToRowExec +(21) VeloxColumnarToRowExec Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(20) Scan parquet +(22) Scan parquet Output [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), LessThanOrEqual(l_shipdate,1998-09-02)] ReadSchema: struct -(21) Filter +(23) Filter Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Condition : (isnotnull(l_shipdate#X) AND (l_shipdate#X <= 1998-09-02)) -(22) Project +(24) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] -(23) HashAggregate +(25) HashAggregate Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [partial_sum(l_quantity#X), partial_sum(l_extendedprice#X), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)), partial_sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true)), partial_avg(l_quantity#X), partial_avg(l_extendedprice#X), partial_avg(l_discount#X), partial_count(1)] Aggregate Attributes [15]: [sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Results [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(24) Exchange +(26) Exchange Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: 
hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(25) HashAggregate +(27) HashAggregate Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true)), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(26) Exchange +(28) Exchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Sort +(29) Sort Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(28) AdaptiveSparkPlan 
+(30) AdaptiveSparkPlan Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/10.txt index c0153da0cda99..29f28ac26a6be 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/10.txt @@ -1,68 +1,65 @@ == Physical Plan == -AdaptiveSparkPlan (67) +AdaptiveSparkPlan (68) +- == Final Plan == - VeloxColumnarToRowExec (43) - +- TakeOrderedAndProjectExecTransformer (42) - +- ^ ProjectExecTransformer (40) - +- ^ RegularHashAggregateExecTransformer (39) - +- ^ InputIteratorTransformer (38) - +- ^ InputAdapter (37) - +- ^ ShuffleQueryStage (36) - +- ColumnarExchange (35) + VeloxColumnarToRowExec (44) + +- TakeOrderedAndProjectExecTransformer (43) + +- ^ ProjectExecTransformer (41) + +- ^ RegularHashAggregateExecTransformer (40) + +- ^ InputIteratorTransformer (39) + +- ShuffleQueryStage (37) + +- ColumnarExchange (36) + +- VeloxAppendBatches (35) +- ^ ProjectExecTransformer (33) +- ^ FlushableHashAggregateExecTransformer (32) +- ^ ProjectExecTransformer (31) - +- ^ BroadcastHashJoinExecTransformer Inner (30) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (30) :- ^ ProjectExecTransformer (22) - : +- ^ BroadcastHashJoinExecTransformer Inner (21) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (21) : :- ^ ProjectExecTransformer (12) - : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) : : :- ^ NoopFilter (2) : : : +- ^ Scan parquet (1) : : +- ^ InputIteratorTransformer (10) - : : +- ^ InputAdapter (9) - : : +- ^ BroadcastQueryStage (8) - : : +- ColumnarBroadcastExchange (7) - : : +- ^ ProjectExecTransformer (5) - : : +- ^ NoopFilter (4) - : : +- ^ Scan parquet (3) + : : +- BroadcastQueryStage (8) + : : +- ColumnarBroadcastExchange (7) + : : +- ^ ProjectExecTransformer (5) + : : +- ^ NoopFilter (4) + : : +- ^ Scan parquet (3) : +- ^ InputIteratorTransformer (20) - : +- ^ InputAdapter (19) - : +- ^ BroadcastQueryStage (18) - : +- ColumnarBroadcastExchange (17) - : +- ^ ProjectExecTransformer (15) - : +- ^ NoopFilter (14) - : +- ^ Scan parquet (13) + : +- BroadcastQueryStage (18) + : +- ColumnarBroadcastExchange (17) + : +- ^ ProjectExecTransformer (15) + : +- ^ NoopFilter (14) + : +- ^ Scan parquet (13) +- ^ InputIteratorTransformer (29) - +- ^ InputAdapter (28) - +- ^ BroadcastQueryStage (27) - +- ColumnarBroadcastExchange (26) - +- ^ NoopFilter (24) - +- ^ Scan parquet (23) + +- BroadcastQueryStage (27) + +- ColumnarBroadcastExchange (26) + +- ^ NoopFilter (24) + +- ^ Scan parquet (23) +- == Initial Plan == - TakeOrderedAndProject (66) - +- HashAggregate (65) - +- Exchange (64) - +- HashAggregate (63) - +- Project (62) - +- BroadcastHashJoin Inner BuildRight (61) - :- Project (57) - : +- BroadcastHashJoin Inner BuildRight (56) - : :- Project (51) - : : +- BroadcastHashJoin Inner BuildRight (50) - : : :- Filter (45) - : : : +- Scan parquet (44) - : : +- BroadcastExchange (49) - : : +- Project (48) - : : +- Filter (47) - : : +- Scan parquet (46) - : +- BroadcastExchange (55) - : +- Project (54) - : +- Filter (53) - : +- Scan parquet (52) - +- BroadcastExchange (60) - +- Filter (59) - 
+- Scan parquet (58) + TakeOrderedAndProject (67) + +- HashAggregate (66) + +- Exchange (65) + +- HashAggregate (64) + +- Project (63) + +- BroadcastHashJoin Inner BuildRight (62) + :- Project (58) + : +- BroadcastHashJoin Inner BuildRight (57) + : :- Project (52) + : : +- BroadcastHashJoin Inner BuildRight (51) + : : :- Filter (46) + : : : +- Scan parquet (45) + : : +- BroadcastExchange (50) + : : +- Project (49) + : : +- Filter (48) + : : +- Scan parquet (47) + : +- BroadcastExchange (56) + : +- Project (55) + : +- Filter (54) + : +- Scan parquet (53) + +- BroadcastExchange (61) + +- Filter (60) + +- Scan parquet (59) (1) Scan parquet @@ -213,155 +210,159 @@ Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: false -(35) ColumnarExchange +(35) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] +Arguments: X + +(36) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(37) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: X -(37) InputAdapter +(38) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(38) InputIteratorTransformer +(39) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(39) RegularHashAggregateExecTransformer +(40) RegularHashAggregateExecTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(40) ProjectExecTransformer +(41) ProjectExecTransformer Output [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, 
c_comment#X] Input [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(41) WholeStageCodegenTransformer (X) +(42) WholeStageCodegenTransformer (X) Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: false -(42) TakeOrderedAndProjectExecTransformer +(43) TakeOrderedAndProjectExecTransformer Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X], 0 -(43) VeloxColumnarToRowExec +(44) VeloxColumnarToRowExec Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(44) Scan parquet +(45) Scan parquet Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(45) Filter +(46) Filter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(46) Scan parquet +(47) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(47) Filter +(48) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(48) Project +(49) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(49) BroadcastExchange +(50) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(50) BroadcastHashJoin +(51) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(51) Project +(52) Project Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(52) Scan parquet +(53) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(53) Filter +(54) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Condition : ((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(54) Project +(55) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(55) BroadcastExchange +(56) BroadcastExchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, 
bigint, true]),false), [plan_id=X] -(56) BroadcastHashJoin +(57) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(57) Project +(58) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(58) Scan parquet +(59) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(59) Filter +(60) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(60) BroadcastExchange +(61) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(61) BroadcastHashJoin +(62) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(62) Project +(63) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(63) HashAggregate +(64) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(64) Exchange +(65) Exchange Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(65) HashAggregate +(66) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(66) TakeOrderedAndProject +(67) TakeOrderedAndProject Input [8]: [c_custkey#X, 
c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(67) AdaptiveSparkPlan +(68) AdaptiveSparkPlan Output [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/11.txt index 219d4b7c14de8..b8f466eb9456c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/11.txt @@ -1,59 +1,57 @@ == Physical Plan == -AdaptiveSparkPlan (58) +AdaptiveSparkPlan (60) +- == Final Plan == - VeloxColumnarToRowExec (38) - +- ^ SortExecTransformer (36) - +- ^ InputIteratorTransformer (35) - +- ^ InputAdapter (34) - +- ^ ShuffleQueryStage (33) - +- ColumnarExchange (32) - +- ^ FilterExecTransformer (30) - +- ^ RegularHashAggregateExecTransformer (29) - +- ^ InputIteratorTransformer (28) - +- ^ InputAdapter (27) - +- ^ ShuffleQueryStage (26) - +- ColumnarExchange (25) + VeloxColumnarToRowExec (40) + +- ^ SortExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ FilterExecTransformer (31) + +- ^ RegularHashAggregateExecTransformer (30) + +- ^ InputIteratorTransformer (29) + +- ShuffleQueryStage (27) + +- ColumnarExchange (26) + +- VeloxAppendBatches (25) +- ^ ProjectExecTransformer (23) +- ^ FlushableHashAggregateExecTransformer (22) +- ^ ProjectExecTransformer (21) - +- ^ BroadcastHashJoinExecTransformer Inner (20) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) :- ^ ProjectExecTransformer (11) - : +- ^ BroadcastHashJoinExecTransformer Inner (10) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (10) : :- ^ NoopFilter (2) : : +- ^ Scan parquet (1) : +- ^ InputIteratorTransformer (9) - : +- ^ InputAdapter (8) - : +- ^ BroadcastQueryStage (7) - : +- ColumnarBroadcastExchange (6) - : +- ^ NoopFilter (4) - : +- ^ Scan parquet (3) + : +- BroadcastQueryStage (7) + : +- ColumnarBroadcastExchange (6) + : +- ^ NoopFilter (4) + : +- ^ Scan parquet (3) +- ^ InputIteratorTransformer (19) - +- ^ InputAdapter (18) - +- ^ BroadcastQueryStage (17) - +- ColumnarBroadcastExchange (16) - +- ^ ProjectExecTransformer (14) - +- ^ NoopFilter (13) - +- ^ Scan parquet (12) + +- BroadcastQueryStage (17) + +- ColumnarBroadcastExchange (16) + +- ^ ProjectExecTransformer (14) + +- ^ NoopFilter (13) + +- ^ Scan parquet (12) +- == Initial Plan == - Sort (57) - +- Exchange (56) - +- Filter (55) - +- HashAggregate (54) - +- Exchange (53) - +- HashAggregate (52) - +- Project (51) - +- BroadcastHashJoin Inner BuildRight (50) - :- Project (45) - : +- BroadcastHashJoin Inner BuildRight (44) - : :- Filter (40) - : : +- Scan parquet (39) - : +- BroadcastExchange (43) - : +- Filter (42) - : +- Scan parquet (41) - +- BroadcastExchange (49) - +- Project (48) - +- Filter (47) - +- Scan parquet (46) + Sort (59) + +- Exchange (58) + +- Filter (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- Project (53) + +- BroadcastHashJoin Inner BuildRight (52) + :- Project (47) + : +- BroadcastHashJoin Inner BuildRight (46) + : :- Filter (42) + : : +- Scan parquet (41) + : +- 
BroadcastExchange (45) + : +- Filter (44) + : +- Scan parquet (43) + +- BroadcastExchange (51) + +- Project (50) + +- Filter (49) + +- Scan parquet (48) (1) Scan parquet @@ -162,153 +160,161 @@ Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: false -(25) ColumnarExchange +(25) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] +Arguments: X + +(26) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(26) ShuffleQueryStage +(27) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: X -(27) InputAdapter +(28) InputAdapter Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(28) InputIteratorTransformer +(29) InputIteratorTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(29) RegularHashAggregateExecTransformer +(30) RegularHashAggregateExecTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X AS value#X] -(30) FilterExecTransformer +(31) FilterExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(31) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(32) ColumnarExchange +(33) VeloxAppendBatches +Input [2]: [ps_partkey#X, value#X] +Arguments: X + +(34) ColumnarExchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(33) ShuffleQueryStage +(35) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] Arguments: X -(34) InputAdapter +(36) InputAdapter Input [2]: [ps_partkey#X, value#X] -(35) InputIteratorTransformer +(37) InputIteratorTransformer Input [2]: [ps_partkey#X, value#X] -(36) SortExecTransformer +(38) SortExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(37) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(38) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [2]: [ps_partkey#X, value#X] -(39) Scan parquet +(41) Scan parquet Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(40) Filter +(42) Filter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(41) Scan parquet +(43) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(42) Filter +(44) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND 
isnotnull(s_nationkey#X)) -(43) BroadcastExchange +(45) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(44) BroadcastHashJoin +(46) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(45) Project +(47) Project Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(46) Scan parquet +(48) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(47) Filter +(49) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(48) Project +(50) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(49) BroadcastExchange +(51) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(50) BroadcastHashJoin +(52) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(51) Project +(53) Project Output [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(52) HashAggregate +(54) HashAggregate Input [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(53) Exchange +(55) Exchange Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) HashAggregate +(56) HashAggregate Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X AS value#X] -(55) Filter +(57) Filter Input [2]: [ps_partkey#X, value#X] Condition : (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(56) Exchange +(58) Exchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(57) Sort +(59) Sort Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(58) AdaptiveSparkPlan +(60) AdaptiveSparkPlan Output [2]: [ps_partkey#X, value#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/12.txt index 985cbf4c3c59f..1e60a93910a7b 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/12.txt @@ -1,44 +1,43 @@ == Physical Plan == -AdaptiveSparkPlan (42) +AdaptiveSparkPlan (44) +- == Final Plan == - VeloxColumnarToRowExec (28) - +- ^ SortExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23) - +- ColumnarExchange (22) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ InputIteratorTransformer (19) - +- ^ InputAdapter (18) - +- ^ ShuffleQueryStage (17) - +- ColumnarExchange (16) + VeloxColumnarToRowExec (30) + +- ^ SortExecTransformer (28) + +- ^ InputIteratorTransformer (27) + +- ShuffleQueryStage (25) + +- ColumnarExchange (24) + +- VeloxAppendBatches (23) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ InputIteratorTransformer (20) + +- ShuffleQueryStage (18) + +- ColumnarExchange (17) + +- VeloxAppendBatches (16) +- ^ ProjectExecTransformer (14) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ BroadcastQueryStage (5) - : +- ColumnarBroadcastExchange (4) - : +- ^ NoopFilter (2) - : +- ^ Scan parquet (1) + : +- BroadcastQueryStage (5) + : +- ColumnarBroadcastExchange (4) + : +- ^ NoopFilter (2) + : +- ^ Scan parquet (1) +- ^ ProjectExecTransformer (10) +- ^ NoopFilter (9) +- ^ Scan parquet (8) +- == Initial Plan == - Sort (41) - +- Exchange (40) - +- HashAggregate (39) - +- Exchange (38) - +- HashAggregate (37) - +- Project (36) - +- BroadcastHashJoin Inner BuildLeft (35) - :- BroadcastExchange (31) - : +- Filter (30) - : +- Scan parquet (29) - +- Project (34) - +- Filter (33) - +- Scan parquet (32) + Sort (43) + +- Exchange (42) + +- HashAggregate (41) + +- Exchange (40) + +- HashAggregate (39) + +- Project (38) + +- BroadcastHashJoin Inner BuildLeft (37) + :- BroadcastExchange (33) + : +- Filter (32) + : +- Scan parquet (31) + +- Project (36) + +- Filter (35) + +- Scan parquet (34) (1) Scan parquet @@ -109,121 +108,129 @@ Input [3]: [l_shipmode#X, sum#X, sum#X] Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: false -(16) ColumnarExchange +(16) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] +Arguments: X + +(17) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] -(17) ShuffleQueryStage +(18) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] Arguments: X -(18) InputAdapter +(19) InputAdapter Input [3]: [l_shipmode#X, sum#X, sum#X] -(19) InputIteratorTransformer +(20) InputIteratorTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] -(20) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] 
Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(22) ColumnarExchange +(23) VeloxAppendBatches +Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] +Arguments: X + +(24) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(25) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: X -(24) InputAdapter +(26) InputAdapter Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(25) InputIteratorTransformer +(27) InputIteratorTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(26) SortExecTransformer +(28) SortExecTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(27) WholeStageCodegenTransformer (X) +(29) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(28) VeloxColumnarToRowExec +(30) VeloxColumnarToRowExec Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(29) Scan parquet +(31) Scan parquet Output [2]: [o_orderkey#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(30) Filter +(32) Filter Input [2]: [o_orderkey#X, o_orderpriority#X] Condition : isnotnull(o_orderkey#X) -(31) BroadcastExchange +(33) BroadcastExchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(32) Scan parquet +(34) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(33) Filter +(35) Filter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Condition : ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(34) Project +(36) Project Output [2]: [l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(35) BroadcastHashJoin +(37) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(36) Project +(38) Project Output [2]: [o_orderpriority#X, l_shipmode#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(37) HashAggregate +(39) HashAggregate Input [2]: [o_orderpriority#X, l_shipmode#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), 
partial_sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(38) Exchange +(40) Exchange Input [3]: [l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) HashAggregate +(41) HashAggregate Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(40) Exchange +(42) Exchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Sort +(43) Sort Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(42) AdaptiveSparkPlan +(44) AdaptiveSparkPlan Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/13.txt index 4a3a239381ca1..c525944e12623 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/13.txt @@ -1,53 +1,52 @@ == Physical Plan == -AdaptiveSparkPlan (52) +AdaptiveSparkPlan (55) +- == Final Plan == - VeloxColumnarToRowExec (36) - +- ^ SortExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31) - +- ColumnarExchange (30) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (39) + +- ^ SortExecTransformer (37) + +- ^ InputIteratorTransformer (36) + +- ShuffleQueryStage (34) + +- ColumnarExchange (33) + +- VeloxAppendBatches (32) + +- ^ RegularHashAggregateExecTransformer (30) + +- ^ InputIteratorTransformer (29) + +- ShuffleQueryStage (27) + +- ColumnarExchange (26) + +- VeloxAppendBatches (25) + +- ^ ProjectExecTransformer (23) + +- ^ FlushableHashAggregateExecTransformer (22) + +- ^ ProjectExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ ProjectExecTransformer (13) +- ^ FlushableHashAggregateExecTransformer (12) +- ^ 
ProjectExecTransformer (11) - +- ^ BroadcastHashJoinExecTransformer LeftOuter (10) + +- ^ BroadcastHashJoinExecTransformer LeftOuter BuildRight (10) :- ^ Scan parquet (1) +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ BroadcastQueryStage (7) - +- ColumnarBroadcastExchange (6) - +- ^ ProjectExecTransformer (4) - +- ^ NoopFilter (3) - +- ^ Scan parquet (2) + +- BroadcastQueryStage (7) + +- ColumnarBroadcastExchange (6) + +- ^ ProjectExecTransformer (4) + +- ^ NoopFilter (3) + +- ^ Scan parquet (2) +- == Initial Plan == - Sort (51) - +- Exchange (50) - +- HashAggregate (49) - +- Exchange (48) - +- HashAggregate (47) - +- HashAggregate (46) - +- Exchange (45) - +- HashAggregate (44) - +- Project (43) - +- BroadcastHashJoin LeftOuter BuildRight (42) - :- Scan parquet (37) - +- BroadcastExchange (41) - +- Project (40) - +- Filter (39) - +- Scan parquet (38) + Sort (54) + +- Exchange (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- HashAggregate (49) + +- Exchange (48) + +- HashAggregate (47) + +- Project (46) + +- BroadcastHashJoin LeftOuter BuildRight (45) + :- Scan parquet (40) + +- BroadcastExchange (44) + +- Project (43) + +- Filter (42) + +- Scan parquet (41) (1) Scan parquet @@ -113,174 +112,186 @@ Input [2]: [c_custkey#X, count#X] Input [3]: [hash_partition_key#X, c_custkey#X, count#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, count#X] +Arguments: X + +(16) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, count#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [2]: [c_custkey#X, count#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [2]: [c_custkey#X, count#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [2]: [c_custkey#X, count#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [2]: [c_custkey#X, count(o_orderkey#X)#X] -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [count(o_orderkey#X)#X AS c_count#X] Input [2]: [c_custkey#X, count(o_orderkey#X)#X] -(21) FlushableHashAggregateExecTransformer +(22) FlushableHashAggregateExecTransformer Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(22) ProjectExecTransformer +(23) ProjectExecTransformer Output [3]: [hash(c_count#X, 42) AS hash_partition_key#X, c_count#X, count#X] Input [2]: [c_count#X, count#X] -(23) WholeStageCodegenTransformer (X) +(24) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: false -(24) ColumnarExchange +(25) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_count#X, count#X] +Arguments: X + +(26) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(27) ShuffleQueryStage Output [2]: [c_count#X, count#X] Arguments: X -(26) InputAdapter +(28) InputAdapter Input [2]: [c_count#X, count#X] -(27) InputIteratorTransformer +(29) InputIteratorTransformer Input [2]: [c_count#X, count#X] -(28) RegularHashAggregateExecTransformer +(30) 
RegularHashAggregateExecTransformer Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(29) WholeStageCodegenTransformer (X) +(31) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(30) ColumnarExchange +(32) VeloxAppendBatches +Input [2]: [c_count#X, custdist#X] +Arguments: X + +(33) ColumnarExchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(34) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] Arguments: X -(32) InputAdapter +(35) InputAdapter Input [2]: [c_count#X, custdist#X] -(33) InputIteratorTransformer +(36) InputIteratorTransformer Input [2]: [c_count#X, custdist#X] -(34) SortExecTransformer +(37) SortExecTransformer Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(35) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(36) VeloxColumnarToRowExec +(39) VeloxColumnarToRowExec Input [2]: [c_count#X, custdist#X] -(37) Scan parquet +(40) Scan parquet Output [1]: [c_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Scan parquet +(41) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(39) Filter +(42) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Condition : ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(40) Project +(43) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(41) BroadcastExchange +(44) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(42) BroadcastHashJoin +(45) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(43) Project +(46) Project Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(44) HashAggregate +(47) HashAggregate Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(45) Exchange +(48) Exchange Input [2]: [c_custkey#X, count#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(46) HashAggregate +(49) HashAggregate Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [1]: [count(o_orderkey#X)#X AS c_count#X] -(47) HashAggregate +(50) HashAggregate Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(48) Exchange +(51) Exchange Input [2]: [c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) HashAggregate +(52) HashAggregate Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(50) Exchange +(53) Exchange Input [2]: [c_count#X, custdist#X] 
Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Sort +(54) Sort Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(52) AdaptiveSparkPlan +(55) AdaptiveSparkPlan Output [2]: [c_count#X, custdist#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/14.txt index ee1fe0a3316ca..fbfcf07030ebf 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/14.txt @@ -1,37 +1,36 @@ == Physical Plan == -AdaptiveSparkPlan (34) +AdaptiveSparkPlan (35) +- == Final Plan == - VeloxColumnarToRowExec (22) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (23) + +- ^ ProjectExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) :- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ BroadcastQueryStage (8) - +- ColumnarBroadcastExchange (7) - +- ^ NoopFilter (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (8) + +- ColumnarBroadcastExchange (7) + +- ^ NoopFilter (5) + +- ^ Scan parquet (4) +- == Initial Plan == - HashAggregate (33) - +- Exchange (32) - +- HashAggregate (31) - +- Project (30) - +- BroadcastHashJoin Inner BuildRight (29) - :- Project (25) - : +- Filter (24) - : +- Scan parquet (23) - +- BroadcastExchange (28) - +- Filter (27) - +- Scan parquet (26) + HashAggregate (34) + +- Exchange (33) + +- HashAggregate (32) + +- Project (31) + +- BroadcastHashJoin Inner BuildRight (30) + :- Project (26) + : +- Filter (25) + : +- Scan parquet (24) + +- BroadcastExchange (29) + +- Filter (28) + +- Scan parquet (27) (1) Scan parquet @@ -98,95 +97,99 @@ Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(16) ColumnarExchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), 
true))), DecimalType(26,4), true) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X)), DecimalType(38,6), true)) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X as decimal(38,6)))), DecimalType(38,6), true) AS promo_revenue#X] Input [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [1]: [promo_revenue#X] Arguments: false -(22) VeloxColumnarToRowExec +(23) VeloxColumnarToRowExec Input [1]: [promo_revenue#X] -(23) Scan parquet +(24) Scan parquet Output [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-09-01), LessThan(l_shipdate,1995-10-01), IsNotNull(l_partkey)] ReadSchema: struct -(24) Filter +(25) Filter Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-09-01)) AND (l_shipdate#X < 1995-10-01)) AND isnotnull(l_partkey#X)) -(25) Project +(26) Project Output [3]: [l_partkey#X, l_extendedprice#X, 
l_discount#X] Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(26) Scan parquet +(27) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(27) Filter +(28) Filter Input [2]: [p_partkey#X, p_type#X] Condition : isnotnull(p_partkey#X) -(28) BroadcastExchange +(29) BroadcastExchange Input [2]: [p_partkey#X, p_type#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(29) BroadcastHashJoin +(30) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(30) Project +(31) Project Output [3]: [l_extendedprice#X, l_discount#X, p_type#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(31) HashAggregate +(32) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, p_type#X] Keys: [] Functions [2]: [partial_sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(32) Exchange +(33) Exchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(33) HashAggregate +(34) HashAggregate Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X)), DecimalType(38,6), true)) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), 
DecimalType(13,2), true))), DecimalType(26,4), true))#X as decimal(38,6)))), DecimalType(38,6), true) AS promo_revenue#X] -(34) AdaptiveSparkPlan +(35) AdaptiveSparkPlan Output [1]: [promo_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/15.txt index 076e82bd53e3e..b5073901c6206 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/15.txt @@ -1,46 +1,45 @@ == Physical Plan == -AdaptiveSparkPlan (44) +AdaptiveSparkPlan (46) +- == Final Plan == - VeloxColumnarToRowExec (29) - +- ^ SortExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24) - +- ColumnarExchange (23) - +- ^ ProjectExecTransformer (21) - +- ^ BroadcastHashJoinExecTransformer Inner (20) + VeloxColumnarToRowExec (31) + +- ^ SortExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (21) :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ BroadcastQueryStage (5) - : +- ColumnarBroadcastExchange (4) - : +- ^ NoopFilter (2) - : +- ^ Scan parquet (1) - +- ^ FilterExecTransformer (19) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ InputIteratorTransformer (17) - +- ^ InputAdapter (16) - +- ^ ShuffleQueryStage (15) - +- ColumnarExchange (14) + : +- BroadcastQueryStage (5) + : +- ColumnarBroadcastExchange (4) + : +- ^ NoopFilter (2) + : +- ^ Scan parquet (1) + +- ^ FilterExecTransformer (20) + +- ^ RegularHashAggregateExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) +- ^ ProjectExecTransformer (12) +- ^ FlushableHashAggregateExecTransformer (11) +- ^ ProjectExecTransformer (10) +- ^ NoopFilter (9) +- ^ Scan parquet (8) +- == Initial Plan == - Sort (43) - +- Exchange (42) - +- Project (41) - +- BroadcastHashJoin Inner BuildLeft (40) - :- BroadcastExchange (32) - : +- Filter (31) - : +- Scan parquet (30) - +- Filter (39) - +- HashAggregate (38) - +- Exchange (37) - +- HashAggregate (36) - +- Project (35) - +- Filter (34) - +- Scan parquet (33) + Sort (45) + +- Exchange (44) + +- Project (43) + +- BroadcastHashJoin Inner BuildLeft (42) + :- BroadcastExchange (34) + : +- Filter (33) + : +- Scan parquet (32) + +- Filter (41) + +- HashAggregate (40) + +- Exchange (39) + +- HashAggregate (38) + +- Project (37) + +- Filter (36) + +- Scan parquet (35) (1) Scan parquet @@ -102,138 +101,146 @@ Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: 
[l_suppkey#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS total_revenue#X] -(19) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [supplier_no#X, total_revenue#X] Arguments: (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(20) BroadcastHashJoinExecTransformer +(21) BroadcastHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(21) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: X -(25) InputAdapter +(27) InputAdapter Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(27) SortExecTransformer +(29) SortExecTransformer Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(28) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(29) VeloxColumnarToRowExec +(31) VeloxColumnarToRowExec Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(30) Scan parquet +(32) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey)] ReadSchema: struct -(31) Filter +(33) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Condition : isnotnull(s_suppkey#X) -(32) BroadcastExchange +(34) BroadcastExchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(33) Scan parquet +(35) Scan parquet Output [4]: [l_suppkey#X, 
l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(34) Filter +(36) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(35) Project +(37) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(36) HashAggregate +(38) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(37) Exchange +(39) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(38) HashAggregate +(40) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS total_revenue#X] -(39) Filter +(41) Filter Input [2]: [supplier_no#X, total_revenue#X] Condition : (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(40) BroadcastHashJoin +(42) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(41) Project +(43) Project Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(42) Exchange +(44) Exchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) Sort +(45) Sort Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(44) AdaptiveSparkPlan +(46) AdaptiveSparkPlan Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/16.txt index 603ea771b1670..f3cef927551c5 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/16.txt +++ 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/16.txt @@ -1,57 +1,56 @@ == Physical Plan == -AdaptiveSparkPlan (56) +AdaptiveSparkPlan (59) +- == Final Plan == - VeloxColumnarToRowExec (35) - +- ^ SortExecTransformer (33) - +- ^ InputIteratorTransformer (32) - +- ^ InputAdapter (31) - +- ^ ShuffleQueryStage (30) - +- ColumnarExchange (29) - +- ^ RegularHashAggregateExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24) - +- ColumnarExchange (23) - +- ^ ProjectExecTransformer (21) - +- ^ FlushableHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ ProjectExecTransformer (13) +- ^ FlushableHashAggregateExecTransformer (12) +- ^ ProjectExecTransformer (11) - +- ^ BroadcastHashJoinExecTransformer Inner (10) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (10) :- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ BroadcastQueryStage (7) - +- ColumnarBroadcastExchange (6) - +- ^ NoopFilter (4) - +- ^ Scan parquet (3) + +- BroadcastQueryStage (7) + +- ColumnarBroadcastExchange (6) + +- ^ NoopFilter (4) + +- ^ Scan parquet (3) +- == Initial Plan == - Sort (55) - +- Exchange (54) - +- HashAggregate (53) - +- Exchange (52) - +- HashAggregate (51) - +- HashAggregate (50) - +- Exchange (49) - +- HashAggregate (48) - +- Project (47) - +- BroadcastHashJoin Inner BuildRight (46) - :- BroadcastHashJoin LeftAnti BuildRight (42) - : :- Filter (37) - : : +- Scan parquet (36) - : +- BroadcastExchange (41) - : +- Project (40) - : +- Filter (39) - : +- Scan parquet (38) - +- BroadcastExchange (45) - +- Filter (44) - +- Scan parquet (43) + Sort (58) + +- Exchange (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- HashAggregate (53) + +- Exchange (52) + +- HashAggregate (51) + +- Project (50) + +- BroadcastHashJoin Inner BuildRight (49) + :- BroadcastHashJoin LeftAnti BuildRight (45) + : :- Filter (40) + : : +- Scan parquet (39) + : +- BroadcastExchange (44) + : +- Project (43) + : +- Filter (42) + : +- Scan parquet (41) + +- BroadcastExchange (48) + +- Filter (47) + +- Scan parquet (46) (1) Scan parquet @@ -118,195 +117,207 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] +Arguments: X + +(16) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] -(16) 
ShuffleQueryStage +(17) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(20) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(21) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: X -(25) InputAdapter +(27) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(27) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(28) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(29) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(30) ShuffleQueryStage +(33) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: X -(31) InputAdapter +(34) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(32) InputIteratorTransformer +(35) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(33) SortExecTransformer +(36) SortExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 
-(34) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(35) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(36) Scan parquet +(39) Scan parquet Output [2]: [ps_partkey#X, ps_suppkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_partkey)] ReadSchema: struct -(37) Filter +(40) Filter Input [2]: [ps_partkey#X, ps_suppkey#X] Condition : isnotnull(ps_partkey#X) -(38) Scan parquet +(41) Scan parquet Output [2]: [s_suppkey#X, s_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_comment)] ReadSchema: struct -(39) Filter +(42) Filter Input [2]: [s_suppkey#X, s_comment#X] Condition : (isnotnull(s_comment#X) AND s_comment#X LIKE %Customer%Complaints%) -(40) Project +(43) Project Output [1]: [s_suppkey#X] Input [2]: [s_suppkey#X, s_comment#X] -(41) BroadcastExchange +(44) BroadcastExchange Input [1]: [s_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),true), [plan_id=X] -(42) BroadcastHashJoin +(45) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(43) Scan parquet +(46) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(44) Filter +(47) Filter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Condition : (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(45) BroadcastExchange +(48) BroadcastExchange Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(46) BroadcastHashJoin +(49) BroadcastHashJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(47) Project +(50) Project Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(48) HashAggregate +(51) HashAggregate Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(49) Exchange +(52) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(50) HashAggregate +(53) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(51) HashAggregate +(54) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(52) Exchange +(55) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), 
ENSURE_REQUIREMENTS, [plan_id=X] -(53) HashAggregate +(56) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(54) Exchange +(57) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(55) Sort +(58) Sort Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(56) AdaptiveSparkPlan +(59) AdaptiveSparkPlan Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/17.txt index d806c8c2577b5..7c5359849d4e8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/17.txt @@ -1,39 +1,38 @@ == Physical Plan == -AdaptiveSparkPlan (35) +AdaptiveSparkPlan (36) +- == Final Plan == - VeloxColumnarToRowExec (14) - +- ^ ProjectExecTransformer (12) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (15) + +- ^ ProjectExecTransformer (13) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ FlushableHashAggregateExecTransformer (5) +- ^ InputIteratorTransformer (4) - +- ^ InputAdapter (3) - +- ^ RowToVeloxColumnar (2) - +- ^ LocalTableScan (1) + +- RowToVeloxColumnar (2) + +- LocalTableScan (1) +- == Initial Plan == - HashAggregate (34) - +- Exchange (33) - +- HashAggregate (32) - +- Project (31) - +- BroadcastHashJoin Inner BuildRight (30) - :- Project (22) - : +- BroadcastHashJoin Inner BuildRight (21) - : :- Filter (16) - : : +- Scan parquet (15) - : +- BroadcastExchange (20) - : +- Project (19) - : +- Filter (18) - : +- Scan parquet (17) - +- BroadcastExchange (29) - +- Filter (28) - +- HashAggregate (27) - +- Exchange (26) - +- HashAggregate (25) - +- Filter (24) - +- Scan parquet (23) + HashAggregate (35) + +- Exchange (34) + +- HashAggregate (33) + +- Project (32) + +- BroadcastHashJoin Inner BuildRight (31) + :- Project (23) + : +- BroadcastHashJoin Inner BuildRight (22) + : :- Filter (17) + : : +- Scan parquet (16) + : +- BroadcastExchange (21) + : +- Project (20) + : +- Filter (19) + : +- Scan parquet (18) + +- BroadcastExchange (30) + +- Filter (29) + +- HashAggregate (28) + +- Exchange (27) + +- HashAggregate (26) + +- Filter (25) + +- Scan parquet (24) (1) LocalTableScan @@ -60,141 +59,145 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(8) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [2]: [sum#X, 
isEmpty#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [2]: [sum#X, isEmpty#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [sum(l_extendedprice#X)#X] -(12) ProjectExecTransformer +(13) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6), true) AS avg_yearly#X] Input [1]: [sum(l_extendedprice#X)#X] -(13) WholeStageCodegenTransformer (X) +(14) WholeStageCodegenTransformer (X) Input [1]: [avg_yearly#X] Arguments: false -(14) VeloxColumnarToRowExec +(15) VeloxColumnarToRowExec Input [1]: [avg_yearly#X] -(15) Scan parquet +(16) Scan parquet Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_quantity)] ReadSchema: struct -(16) Filter +(17) Filter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Condition : (isnotnull(l_partkey#X) AND isnotnull(l_quantity#X)) -(17) Scan parquet +(18) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(18) Filter +(19) Filter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Condition : ((((isnotnull(p_brand#X) AND isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(19) Project +(20) Project Output [1]: [p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(20) BroadcastExchange +(21) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(21) BroadcastHashJoin +(22) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(22) Project +(23) Project Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(23) Scan parquet +(24) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(24) Filter +(25) Filter Input [2]: [l_partkey#X, l_quantity#X] Condition : isnotnull(l_partkey#X) -(25) HashAggregate +(26) HashAggregate Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(26) Exchange +(27) Exchange Input [3]: [l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) HashAggregate +(28) HashAggregate Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [CheckOverflow((0.200000 * promote_precision(avg(l_quantity#X)#X)), DecimalType(18,7), true) AS (0.2 * avg(l_quantity))#X, l_partkey#X] -(28) Filter +(29) Filter Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Condition : isnotnull((0.2 * avg(l_quantity))#X) -(29) BroadcastExchange +(30) BroadcastExchange Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Arguments: 
HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(30) BroadcastHashJoin +(31) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(31) Project +(32) Project Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(32) HashAggregate +(33) HashAggregate Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(33) Exchange +(34) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(34) HashAggregate +(35) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6), true) AS avg_yearly#X] -(35) AdaptiveSparkPlan +(36) AdaptiveSparkPlan Output [1]: [avg_yearly#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/18.txt index c294e8e6dac15..a0a6e26a6ac3c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/18.txt @@ -1,86 +1,82 @@ == Physical Plan == -AdaptiveSparkPlan (86) +AdaptiveSparkPlan (88) +- == Final Plan == - VeloxColumnarToRowExec (53) - +- TakeOrderedAndProjectExecTransformer (52) - +- ^ RegularHashAggregateExecTransformer (50) - +- ^ InputIteratorTransformer (49) - +- ^ InputAdapter (48) - +- ^ ShuffleQueryStage (47) - +- ColumnarExchange (46) - +- ^ ProjectExecTransformer (44) - +- ^ FlushableHashAggregateExecTransformer (43) - +- ^ ProjectExecTransformer (42) - +- ^ BroadcastHashJoinExecTransformer Inner (41) - :- ^ ProjectExecTransformer (28) - : +- ^ BroadcastHashJoinExecTransformer Inner (27) + VeloxColumnarToRowExec (55) + +- TakeOrderedAndProjectExecTransformer (54) + +- ^ RegularHashAggregateExecTransformer (52) + +- ^ InputIteratorTransformer (51) + +- ShuffleQueryStage (49) + +- ColumnarExchange (48) + +- VeloxAppendBatches (47) + +- ^ ProjectExecTransformer (45) + +- ^ FlushableHashAggregateExecTransformer (44) + +- ^ ProjectExecTransformer (43) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (42) + :- ^ ProjectExecTransformer (29) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (28) : :- ^ InputIteratorTransformer (7) - : : +- ^ InputAdapter (6) - : : +- ^ BroadcastQueryStage (5) - : : +- ColumnarBroadcastExchange (4) - : : +- ^ NoopFilter (2) - : : +- ^ Scan parquet (1) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (26) + : : +- BroadcastQueryStage (5) + : : +- ColumnarBroadcastExchange (4) + : : +- ^ NoopFilter (2) + : : +- ^ Scan parquet (1) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (27) : :- ^ NoopFilter (9) : : +- ^ Scan parquet (8) - : +- ^ InputIteratorTransformer (25) - : +- ^ InputAdapter (24) - : +- ^ BroadcastQueryStage (23) - : +- ColumnarBroadcastExchange (22) - : +- ^ ProjectExecTransformer (20) - : +- ^ FilterExecTransformer (19) - : +- ^ RegularHashAggregateExecTransformer (18) - : +- ^ InputIteratorTransformer (17) - : +- ^ InputAdapter (16) - : +- ^ ShuffleQueryStage (15) - : +- 
ColumnarExchange (14) - : +- ^ ProjectExecTransformer (12) - : +- ^ FlushableHashAggregateExecTransformer (11) - : +- ^ Scan parquet (10) - +- ^ InputIteratorTransformer (40) - +- ^ InputAdapter (39) - +- ^ BroadcastQueryStage (38) - +- ColumnarBroadcastExchange (37) - +- ^ BroadcastHashJoinExecTransformer LeftSemi (35) - :- ^ NoopFilter (30) - : +- ^ Scan parquet (29) - +- ^ InputIteratorTransformer (34) - +- ^ InputAdapter (33) - +- ^ BroadcastQueryStage (32) - +- ReusedExchange (31) + : +- ^ InputIteratorTransformer (26) + : +- BroadcastQueryStage (24) + : +- ColumnarBroadcastExchange (23) + : +- ^ ProjectExecTransformer (21) + : +- ^ FilterExecTransformer (20) + : +- ^ RegularHashAggregateExecTransformer (19) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FlushableHashAggregateExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (41) + +- BroadcastQueryStage (39) + +- ColumnarBroadcastExchange (38) + +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (36) + :- ^ NoopFilter (31) + : +- ^ Scan parquet (30) + +- ^ InputIteratorTransformer (35) + +- BroadcastQueryStage (33) + +- ReusedExchange (32) +- == Initial Plan == - TakeOrderedAndProject (85) - +- HashAggregate (84) - +- Exchange (83) - +- HashAggregate (82) - +- Project (81) - +- BroadcastHashJoin Inner BuildRight (80) - :- Project (68) - : +- BroadcastHashJoin Inner BuildLeft (67) - : :- BroadcastExchange (56) - : : +- Filter (55) - : : +- Scan parquet (54) - : +- BroadcastHashJoin LeftSemi BuildRight (66) - : :- Filter (58) - : : +- Scan parquet (57) - : +- BroadcastExchange (65) - : +- Project (64) - : +- Filter (63) - : +- HashAggregate (62) - : +- Exchange (61) - : +- HashAggregate (60) - : +- Scan parquet (59) - +- BroadcastExchange (79) - +- BroadcastHashJoin LeftSemi BuildRight (78) - :- Filter (70) - : +- Scan parquet (69) - +- BroadcastExchange (77) - +- Project (76) - +- Filter (75) - +- HashAggregate (74) - +- Exchange (73) - +- HashAggregate (72) - +- Scan parquet (71) + TakeOrderedAndProject (87) + +- HashAggregate (86) + +- Exchange (85) + +- HashAggregate (84) + +- Project (83) + +- BroadcastHashJoin Inner BuildRight (82) + :- Project (70) + : +- BroadcastHashJoin Inner BuildLeft (69) + : :- BroadcastExchange (58) + : : +- Filter (57) + : : +- Scan parquet (56) + : +- BroadcastHashJoin LeftSemi BuildRight (68) + : :- Filter (60) + : : +- Scan parquet (59) + : +- BroadcastExchange (67) + : +- Project (66) + : +- Filter (65) + : +- HashAggregate (64) + : +- Exchange (63) + : +- HashAggregate (62) + : +- Scan parquet (61) + +- BroadcastExchange (81) + +- BroadcastHashJoin LeftSemi BuildRight (80) + :- Filter (72) + : +- Scan parquet (71) + +- BroadcastExchange (79) + +- Project (78) + +- Filter (77) + +- HashAggregate (76) + +- Exchange (75) + +- HashAggregate (74) + +- Scan parquet (73) (1) Scan parquet @@ -144,333 +140,341 @@ Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [l_orderkey#X, 
sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(19) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [1]: [l_orderkey#X] Arguments: false -(22) ColumnarBroadcastExchange +(23) ColumnarBroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(23) BroadcastQueryStage +(24) BroadcastQueryStage Output [1]: [l_orderkey#X] Arguments: X -(24) InputAdapter +(25) InputAdapter Input [1]: [l_orderkey#X] -(25) InputIteratorTransformer +(26) InputIteratorTransformer Input [1]: [l_orderkey#X] -(26) BroadcastHashJoinExecTransformer +(27) BroadcastHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(27) BroadcastHashJoinExecTransformer +(28) BroadcastHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(28) ProjectExecTransformer +(29) ProjectExecTransformer Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(29) Scan parquet +(30) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(30) NoopFilter +(31) NoopFilter Input [2]: [l_orderkey#X, l_quantity#X] Arguments: [l_orderkey#X, l_quantity#X] -(31) ReusedExchange [Reuses operator id: 22] +(32) ReusedExchange [Reuses operator id: 23] Output [1]: [l_orderkey#X] -(32) BroadcastQueryStage +(33) BroadcastQueryStage Output [1]: [l_orderkey#X] Arguments: X -(33) InputAdapter +(34) InputAdapter Input [1]: [l_orderkey#X] -(34) InputIteratorTransformer +(35) InputIteratorTransformer Input [1]: [l_orderkey#X] -(35) BroadcastHashJoinExecTransformer +(36) BroadcastHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(36) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [2]: [l_orderkey#X, l_quantity#X] Arguments: false -(37) ColumnarBroadcastExchange +(38) ColumnarBroadcastExchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(38) BroadcastQueryStage +(39) BroadcastQueryStage Output [2]: [l_orderkey#X, l_quantity#X] Arguments: X -(39) InputAdapter +(40) InputAdapter Input [2]: [l_orderkey#X, l_quantity#X] -(40) InputIteratorTransformer +(41) InputIteratorTransformer Input [2]: [l_orderkey#X, l_quantity#X] -(41) BroadcastHashJoinExecTransformer +(42) BroadcastHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(42) 
ProjectExecTransformer +(43) ProjectExecTransformer Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(43) FlushableHashAggregateExecTransformer +(44) FlushableHashAggregateExecTransformer Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(44) ProjectExecTransformer +(45) ProjectExecTransformer Output [8]: [hash(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 42) AS hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(45) WholeStageCodegenTransformer (X) +(46) WholeStageCodegenTransformer (X) Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: false -(46) ColumnarExchange +(47) VeloxAppendBatches +Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] +Arguments: X + +(48) ColumnarExchange Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(47) ShuffleQueryStage +(49) ShuffleQueryStage Output [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: X -(48) InputAdapter +(50) InputAdapter Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(49) InputIteratorTransformer +(51) InputIteratorTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(50) RegularHashAggregateExecTransformer +(52) RegularHashAggregateExecTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(51) WholeStageCodegenTransformer (X) +(53) WholeStageCodegenTransformer (X) Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: false -(52) TakeOrderedAndProjectExecTransformer +(54) TakeOrderedAndProjectExecTransformer Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X], 0 -(53) VeloxColumnarToRowExec +(55) VeloxColumnarToRowExec Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(54) Scan parquet +(56) Scan parquet Output [2]: [c_custkey#X, c_name#X] Batched: true 
Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey)] ReadSchema: struct -(55) Filter +(57) Filter Input [2]: [c_custkey#X, c_name#X] Condition : isnotnull(c_custkey#X) -(56) BroadcastExchange +(58) BroadcastExchange Input [2]: [c_custkey#X, c_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(57) Scan parquet +(59) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(58) Filter +(60) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Condition : (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(59) Scan parquet +(61) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(60) HashAggregate +(62) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(61) Exchange +(63) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(62) HashAggregate +(64) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(63) Filter +(65) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(64) Project +(66) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(65) BroadcastExchange +(67) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(66) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(67) BroadcastHashJoin +(69) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(68) Project +(70) Project Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(69) Scan parquet +(71) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [2]: [l_orderkey#X, l_quantity#X] Condition : isnotnull(l_orderkey#X) -(71) Scan parquet +(73) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(72) HashAggregate +(74) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(73) Exchange +(75) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) HashAggregate +(76) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(75) Filter +(77) Filter Input [2]: 
[l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(76) Project +(78) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(77) BroadcastExchange +(79) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(78) BroadcastHashJoin +(80) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(79) BroadcastExchange +(81) BroadcastExchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(80) BroadcastHashJoin +(82) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(81) Project +(83) Project Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(82) HashAggregate +(84) HashAggregate Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(83) Exchange +(85) Exchange Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(84) HashAggregate +(86) HashAggregate Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(85) TakeOrderedAndProject +(87) TakeOrderedAndProject Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(86) AdaptiveSparkPlan +(88) AdaptiveSparkPlan Output [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/19.txt index 267ae6add0ac4..d2ab9979f8a37 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/19.txt @@ -1,36 +1,35 @@ == Physical Plan == -AdaptiveSparkPlan (33) +AdaptiveSparkPlan (34) +- == Final Plan == - VeloxColumnarToRowExec (21) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (22) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ 
FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) :- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ BroadcastQueryStage (8) - +- ColumnarBroadcastExchange (7) - +- ^ NoopFilter (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (8) + +- ColumnarBroadcastExchange (7) + +- ^ NoopFilter (5) + +- ^ Scan parquet (4) +- == Initial Plan == - HashAggregate (32) - +- Exchange (31) - +- HashAggregate (30) - +- Project (29) - +- BroadcastHashJoin Inner BuildRight (28) - :- Project (24) - : +- Filter (23) - : +- Scan parquet (22) - +- BroadcastExchange (27) - +- Filter (26) - +- Scan parquet (25) + HashAggregate (33) + +- Exchange (32) + +- HashAggregate (31) + +- Project (30) + +- BroadcastHashJoin Inner BuildRight (29) + :- Project (25) + : +- Filter (24) + : +- Scan parquet (23) + +- BroadcastExchange (28) + +- Filter (27) + +- Scan parquet (26) (1) Scan parquet @@ -97,91 +96,95 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(16) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [2]: [sum#X, isEmpty#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(20) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(21) VeloxColumnarToRowExec +(22) VeloxColumnarToRowExec Input [1]: [revenue#X] -(22) Scan parquet +(23) Scan parquet Output [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipinstruct), In(l_shipmode, [AIR,AIR REG]), EqualTo(l_shipinstruct,DELIVER IN PERSON), IsNotNull(l_partkey), Or(Or(And(GreaterThanOrEqual(l_quantity,1.00),LessThanOrEqual(l_quantity,11.00)),And(GreaterThanOrEqual(l_quantity,10.00),LessThanOrEqual(l_quantity,20.00))),And(GreaterThanOrEqual(l_quantity,20.00),LessThanOrEqual(l_quantity,30.00)))] ReadSchema: struct -(23) Filter +(24) Filter Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Condition : ((((isnotnull(l_shipinstruct#X) AND l_shipmode#X IN (AIR,AIR 
REG)) AND (l_shipinstruct#X = DELIVER IN PERSON)) AND isnotnull(l_partkey#X)) AND ((((l_quantity#X >= 1.00) AND (l_quantity#X <= 11.00)) OR ((l_quantity#X >= 10.00) AND (l_quantity#X <= 20.00))) OR ((l_quantity#X >= 20.00) AND (l_quantity#X <= 30.00)))) -(24) Project +(25) Project Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] -(25) Scan parquet +(26) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(26) Filter +(27) Filter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Condition : (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(27) BroadcastExchange +(28) BroadcastExchange Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(28) BroadcastHashJoin +(29) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(29) Project +(30) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(30) HashAggregate +(31) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(31) Exchange +(32) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(32) HashAggregate +(33) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * 
promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(33) AdaptiveSparkPlan +(34) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/20.txt index 1b4522028bb4e..a1f1bb51cb989 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/20.txt @@ -1,105 +1,102 @@ == Physical Plan == -AdaptiveSparkPlan (107) +AdaptiveSparkPlan (112) +- == Final Plan == - VeloxColumnarToRowExec (70) - +- ^ SortExecTransformer (68) - +- ^ InputIteratorTransformer (67) - +- ^ InputAdapter (66) - +- ^ ShuffleQueryStage (65) - +- ColumnarExchange (64) - +- ^ ProjectExecTransformer (62) - +- ^ BroadcastHashJoinExecTransformer Inner (61) - :- ^ ProjectExecTransformer (52) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (51) - : :- ^ InputIteratorTransformer (9) - : : +- ^ InputAdapter (8) - : : +- ^ AQEShuffleRead (7) - : : +- ^ ShuffleQueryStage (6) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (73) + +- ^ SortExecTransformer (71) + +- ^ InputIteratorTransformer (70) + +- ShuffleQueryStage (68) + +- ColumnarExchange (67) + +- VeloxAppendBatches (66) + +- ^ ProjectExecTransformer (64) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (63) + :- ^ ProjectExecTransformer (54) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (53) + : :- ^ InputIteratorTransformer (10) + : : +- AQEShuffleRead (8) + : : +- ShuffleQueryStage (7) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (50) - : +- ^ InputAdapter (49) - : +- ^ BroadcastQueryStage (48) - : +- ColumnarBroadcastExchange (47) - : +- ^ ProjectExecTransformer (45) - : +- ^ BroadcastHashJoinExecTransformer Inner (44) - : :- ^ InputIteratorTransformer (25) - : : +- ^ InputAdapter (24) - : : +- ^ BroadcastQueryStage (23) - : : +- ColumnarBroadcastExchange (22) - : : +- ^ BroadcastHashJoinExecTransformer LeftSemi (20) - : : :- ^ NoopFilter (11) - : : : +- ^ Scan parquet (10) - : : +- ^ InputIteratorTransformer (19) - : : +- ^ InputAdapter (18) - : : +- ^ BroadcastQueryStage (17) - : : +- ColumnarBroadcastExchange (16) - : : +- ^ ProjectExecTransformer (14) - : : +- ^ NoopFilter (13) - : : +- ^ Scan parquet (12) - : +- ^ FilterExecTransformer (43) - : +- ^ ProjectExecTransformer (42) - : +- ^ RegularHashAggregateExecTransformer (41) - : +- ^ InputIteratorTransformer (40) - : +- ^ InputAdapter (39) - : +- ^ ShuffleQueryStage (38) - : +- ColumnarExchange (37) - : +- ^ ProjectExecTransformer (35) - : +- ^ FlushableHashAggregateExecTransformer (34) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (33) - : :- ^ ProjectExecTransformer (28) - : : +- ^ NoopFilter (27) - : : +- ^ Scan parquet (26) - : +- ^ InputIteratorTransformer (32) - : +- ^ InputAdapter (31) - : +- ^ BroadcastQueryStage (30) - : +- ReusedExchange (29) - +- ^ 
InputIteratorTransformer (60) - +- ^ InputAdapter (59) - +- ^ BroadcastQueryStage (58) - +- ColumnarBroadcastExchange (57) - +- ^ ProjectExecTransformer (55) - +- ^ NoopFilter (54) - +- ^ Scan parquet (53) + : +- ^ InputIteratorTransformer (52) + : +- BroadcastQueryStage (50) + : +- ColumnarBroadcastExchange (49) + : +- ^ ProjectExecTransformer (47) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (46) + : :- ^ InputIteratorTransformer (26) + : : +- BroadcastQueryStage (24) + : : +- ColumnarBroadcastExchange (23) + : : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (21) + : : :- ^ NoopFilter (12) + : : : +- ^ Scan parquet (11) + : : +- ^ InputIteratorTransformer (20) + : : +- BroadcastQueryStage (18) + : : +- ColumnarBroadcastExchange (17) + : : +- ^ ProjectExecTransformer (15) + : : +- ^ NoopFilter (14) + : : +- ^ Scan parquet (13) + : +- ^ FilterExecTransformer (45) + : +- ^ ProjectExecTransformer (44) + : +- ^ RegularHashAggregateExecTransformer (43) + : +- ^ InputIteratorTransformer (42) + : +- ShuffleQueryStage (40) + : +- ColumnarExchange (39) + : +- VeloxAppendBatches (38) + : +- ^ ProjectExecTransformer (36) + : +- ^ FlushableHashAggregateExecTransformer (35) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (34) + : :- ^ ProjectExecTransformer (29) + : : +- ^ NoopFilter (28) + : : +- ^ Scan parquet (27) + : +- ^ InputIteratorTransformer (33) + : +- BroadcastQueryStage (31) + : +- ReusedExchange (30) + +- ^ InputIteratorTransformer (62) + +- BroadcastQueryStage (60) + +- ColumnarBroadcastExchange (59) + +- ^ ProjectExecTransformer (57) + +- ^ NoopFilter (56) + +- ^ Scan parquet (55) +- == Initial Plan == - Sort (106) - +- Exchange (105) - +- Project (104) - +- BroadcastHashJoin Inner BuildRight (103) - :- Project (98) - : +- ShuffledHashJoin LeftSemi BuildRight (97) - : :- Exchange (73) - : : +- Filter (72) - : : +- Scan parquet (71) - : +- Exchange (96) - : +- Project (95) - : +- BroadcastHashJoin Inner BuildLeft (94) - : :- BroadcastExchange (81) - : : +- BroadcastHashJoin LeftSemi BuildRight (80) - : : :- Filter (75) - : : : +- Scan parquet (74) - : : +- BroadcastExchange (79) - : : +- Project (78) - : : +- Filter (77) - : : +- Scan parquet (76) - : +- Filter (93) - : +- HashAggregate (92) - : +- Exchange (91) - : +- HashAggregate (90) - : +- BroadcastHashJoin LeftSemi BuildRight (89) - : :- Project (84) - : : +- Filter (83) - : : +- Scan parquet (82) - : +- BroadcastExchange (88) - : +- Project (87) - : +- Filter (86) - : +- Scan parquet (85) - +- BroadcastExchange (102) - +- Project (101) - +- Filter (100) - +- Scan parquet (99) + Sort (111) + +- Exchange (110) + +- Project (109) + +- BroadcastHashJoin Inner BuildRight (108) + :- Project (103) + : +- SortMergeJoin LeftSemi (102) + : :- Sort (77) + : : +- Exchange (76) + : : +- Filter (75) + : : +- Scan parquet (74) + : +- Sort (101) + : +- Exchange (100) + : +- Project (99) + : +- BroadcastHashJoin Inner BuildLeft (98) + : :- BroadcastExchange (85) + : : +- BroadcastHashJoin LeftSemi BuildRight (84) + : : :- Filter (79) + : : : +- Scan parquet (78) + : : +- BroadcastExchange (83) + : : +- Project (82) + : : +- Filter (81) + : : +- Scan parquet (80) + : +- Filter (97) + : +- HashAggregate (96) + : +- Exchange (95) + : +- HashAggregate (94) + : +- BroadcastHashJoin LeftSemi BuildRight (93) + : :- Project (88) + : : +- Filter (87) + : : +- Scan parquet (86) + : +- BroadcastExchange (92) + : +- Project (91) + : +- Filter (90) + : +- Scan parquet (89) + +- BroadcastExchange (107) + +- Project (106) + 
+- Filter (105) + +- Scan parquet (104) (1) Scan parquet @@ -121,448 +118,468 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: X -(7) AQEShuffleRead +(8) AQEShuffleRead Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: local -(8) InputAdapter +(9) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(10) Scan parquet +(11) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(11) NoopFilter +(12) NoopFilter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(12) Scan parquet +(13) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(13) NoopFilter +(14) NoopFilter Input [2]: [p_partkey#X, p_name#X] Arguments: [p_partkey#X, p_name#X] -(14) ProjectExecTransformer +(15) ProjectExecTransformer Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(15) WholeStageCodegenTransformer (X) +(16) WholeStageCodegenTransformer (X) Input [1]: [p_partkey#X] Arguments: false -(16) ColumnarBroadcastExchange +(17) ColumnarBroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(17) BroadcastQueryStage +(18) BroadcastQueryStage Output [1]: [p_partkey#X] Arguments: X -(18) InputAdapter +(19) InputAdapter Input [1]: [p_partkey#X] -(19) InputIteratorTransformer +(20) InputIteratorTransformer Input [1]: [p_partkey#X] -(20) BroadcastHashJoinExecTransformer +(21) BroadcastHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(22) ColumnarBroadcastExchange +(23) ColumnarBroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(23) BroadcastQueryStage +(24) BroadcastQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(24) InputAdapter +(25) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(25) InputIteratorTransformer +(26) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(26) Scan parquet +(27) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), 
GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(27) NoopFilter +(28) NoopFilter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Arguments: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(28) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(29) ReusedExchange [Reuses operator id: 16] +(30) ReusedExchange [Reuses operator id: 17] Output [1]: [p_partkey#X] -(30) BroadcastQueryStage +(31) BroadcastQueryStage Output [1]: [p_partkey#X] Arguments: X -(31) InputAdapter +(32) InputAdapter Input [1]: [p_partkey#X] -(32) InputIteratorTransformer +(33) InputIteratorTransformer Input [1]: [p_partkey#X] -(33) BroadcastHashJoinExecTransformer +(34) BroadcastHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(34) FlushableHashAggregateExecTransformer +(35) FlushableHashAggregateExecTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(35) ProjectExecTransformer +(36) ProjectExecTransformer Output [5]: [hash(l_partkey#X, l_suppkey#X, 42) AS hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(36) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(37) ColumnarExchange +(38) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(39) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(38) ShuffleQueryStage +(40) ShuffleQueryStage Output [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(39) InputAdapter +(41) InputAdapter Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(40) InputIteratorTransformer +(42) InputIteratorTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(41) RegularHashAggregateExecTransformer +(43) RegularHashAggregateExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(42) ProjectExecTransformer +(44) ProjectExecTransformer Output [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3), true) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(43) FilterExecTransformer +(45) FilterExecTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: isnotnull((0.5 * sum(l_quantity))#X) -(44) BroadcastHashJoinExecTransformer +(46) BroadcastHashJoinExecTransformer Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(cast(ps_availqty#X as decimal(10,0)) as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(45) ProjectExecTransformer +(47) ProjectExecTransformer Output [1]: 
[ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(46) WholeStageCodegenTransformer (X) +(48) WholeStageCodegenTransformer (X) Input [1]: [ps_suppkey#X] Arguments: false -(47) ColumnarBroadcastExchange +(49) ColumnarBroadcastExchange Input [1]: [ps_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(48) BroadcastQueryStage +(50) BroadcastQueryStage Output [1]: [ps_suppkey#X] Arguments: X -(49) InputAdapter +(51) InputAdapter Input [1]: [ps_suppkey#X] -(50) InputIteratorTransformer +(52) InputIteratorTransformer Input [1]: [ps_suppkey#X] -(51) BroadcastHashJoinExecTransformer +(53) BroadcastHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(52) ProjectExecTransformer +(54) ProjectExecTransformer Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(53) Scan parquet +(55) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(54) NoopFilter +(56) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(55) ProjectExecTransformer +(57) ProjectExecTransformer Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(56) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [1]: [n_nationkey#X] Arguments: false -(57) ColumnarBroadcastExchange +(59) ColumnarBroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(58) BroadcastQueryStage +(60) BroadcastQueryStage Output [1]: [n_nationkey#X] Arguments: X -(59) InputAdapter +(61) InputAdapter Input [1]: [n_nationkey#X] -(60) InputIteratorTransformer +(62) InputIteratorTransformer Input [1]: [n_nationkey#X] -(61) BroadcastHashJoinExecTransformer +(63) BroadcastHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(62) ProjectExecTransformer +(64) ProjectExecTransformer Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(63) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(64) ColumnarExchange +(66) VeloxAppendBatches +Input [2]: [s_name#X, s_address#X] +Arguments: X + +(67) ColumnarExchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(65) ShuffleQueryStage +(68) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] Arguments: X -(66) InputAdapter +(69) InputAdapter Input [2]: [s_name#X, s_address#X] -(67) InputIteratorTransformer +(70) InputIteratorTransformer Input [2]: [s_name#X, s_address#X] -(68) SortExecTransformer +(71) SortExecTransformer Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(69) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(70) VeloxColumnarToRowExec +(73) VeloxColumnarToRowExec Input [2]: [s_name#X, s_address#X] -(71) Scan parquet +(74) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_nationkey)] ReadSchema: struct -(72) Filter 
+(75) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Condition : isnotnull(s_nationkey#X) -(73) Exchange +(76) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) Scan parquet +(77) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(78) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(75) Filter +(79) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Condition : ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(76) Scan parquet +(80) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(77) Filter +(81) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(78) Project +(82) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(79) BroadcastExchange +(83) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(80) BroadcastHashJoin +(84) BroadcastHashJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(81) BroadcastExchange +(85) BroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(82) Scan parquet +(86) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(83) Filter +(87) Filter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(84) Project +(88) Project Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(85) Scan parquet +(89) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(86) Filter +(90) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(87) Project +(91) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(88) BroadcastExchange +(92) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(89) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(90) HashAggregate +(94) HashAggregate Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(91) Exchange +(95) Exchange Input [4]: [l_partkey#X, l_suppkey#X, 
sum#X, isEmpty#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(92) HashAggregate +(96) HashAggregate Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3), true) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(93) Filter +(97) Filter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Condition : isnotnull((0.5 * sum(l_quantity))#X) -(94) BroadcastHashJoin +(98) BroadcastHashJoin Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(cast(ps_availqty#X as decimal(10,0)) as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(95) Project +(99) Project Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(96) Exchange +(100) Exchange Input [1]: [ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) ShuffledHashJoin +(101) Sort +Input [1]: [ps_suppkey#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(102) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(98) Project +(103) Project Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(99) Scan parquet +(104) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(100) Filter +(105) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(101) Project +(106) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(102) BroadcastExchange +(107) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(103) BroadcastHashJoin +(108) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(104) Project +(109) Project Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(105) Exchange +(110) Exchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) Sort +(111) Sort Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(107) AdaptiveSparkPlan +(112) AdaptiveSparkPlan Output [2]: [s_name#X, s_address#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/21.txt index 9c09032689eb4..eb3f64d3efccc 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/21.txt @@ -1,91 +1,86 @@ == Physical Plan == -AdaptiveSparkPlan (92) +AdaptiveSparkPlan (93) +- == Final Plan == - VeloxColumnarToRowExec (59) - +- TakeOrderedAndProjectExecTransformer (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53) - +- 
ColumnarExchange (52) + VeloxColumnarToRowExec (60) + +- TakeOrderedAndProjectExecTransformer (59) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ InputIteratorTransformer (56) + +- ShuffleQueryStage (54) + +- ColumnarExchange (53) + +- VeloxAppendBatches (52) +- ^ ProjectExecTransformer (50) +- ^ FlushableHashAggregateExecTransformer (49) +- ^ ProjectExecTransformer (48) - +- ^ BroadcastHashJoinExecTransformer Inner (47) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) :- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (37) : :- ^ ProjectExecTransformer (28) - : : +- ^ BroadcastHashJoinExecTransformer Inner (27) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (27) : : :- ^ InputIteratorTransformer (7) - : : : +- ^ InputAdapter (6) - : : : +- ^ BroadcastQueryStage (5) - : : : +- ColumnarBroadcastExchange (4) - : : : +- ^ NoopFilter (2) - : : : +- ^ Scan parquet (1) - : : +- ^ BroadcastHashJoinExecTransformer LeftAnti (26) - : : :- ^ BroadcastHashJoinExecTransformer LeftSemi (17) + : : : +- BroadcastQueryStage (5) + : : : +- ColumnarBroadcastExchange (4) + : : : +- ^ NoopFilter (2) + : : : +- ^ Scan parquet (1) + : : +- ^ BroadcastHashJoinExecTransformer LeftAnti BuildRight (26) + : : :- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (17) : : : :- ^ ProjectExecTransformer (10) : : : : +- ^ NoopFilter (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (16) - : : : +- ^ InputAdapter (15) - : : : +- ^ BroadcastQueryStage (14) - : : : +- ColumnarBroadcastExchange (13) - : : : +- ^ Scan parquet (11) + : : : +- BroadcastQueryStage (14) + : : : +- ColumnarBroadcastExchange (13) + : : : +- ^ Scan parquet (11) : : +- ^ InputIteratorTransformer (25) - : : +- ^ InputAdapter (24) - : : +- ^ BroadcastQueryStage (23) - : : +- ColumnarBroadcastExchange (22) - : : +- ^ ProjectExecTransformer (20) - : : +- ^ NoopFilter (19) - : : +- ^ Scan parquet (18) + : : +- BroadcastQueryStage (23) + : : +- ColumnarBroadcastExchange (22) + : : +- ^ ProjectExecTransformer (20) + : : +- ^ NoopFilter (19) + : : +- ^ Scan parquet (18) : +- ^ InputIteratorTransformer (36) - : +- ^ InputAdapter (35) - : +- ^ BroadcastQueryStage (34) - : +- ColumnarBroadcastExchange (33) - : +- ^ ProjectExecTransformer (31) - : +- ^ NoopFilter (30) - : +- ^ Scan parquet (29) + : +- BroadcastQueryStage (34) + : +- ColumnarBroadcastExchange (33) + : +- ^ ProjectExecTransformer (31) + : +- ^ NoopFilter (30) + : +- ^ Scan parquet (29) +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ BroadcastQueryStage (44) - +- ColumnarBroadcastExchange (43) - +- ^ ProjectExecTransformer (41) - +- ^ NoopFilter (40) - +- ^ Scan parquet (39) + +- BroadcastQueryStage (44) + +- ColumnarBroadcastExchange (43) + +- ^ ProjectExecTransformer (41) + +- ^ NoopFilter (40) + +- ^ Scan parquet (39) +- == Initial Plan == - TakeOrderedAndProject (91) - +- HashAggregate (90) - +- Exchange (89) - +- HashAggregate (88) - +- Project (87) - +- BroadcastHashJoin Inner BuildRight (86) - :- Project (81) - : +- BroadcastHashJoin Inner BuildRight (80) - : :- Project (75) - : : +- BroadcastHashJoin Inner BuildLeft (74) - : : :- BroadcastExchange (62) - : : : +- Filter (61) - : : : +- Scan parquet (60) - : : +- BroadcastHashJoin LeftAnti BuildRight (73) - : : :- BroadcastHashJoin LeftSemi BuildRight (68) - : : : :- Project (65) - : : : : +- Filter (64) - : : : : +- Scan parquet (63) - : : : +- BroadcastExchange (67) - : : 
: +- Scan parquet (66) - : : +- BroadcastExchange (72) - : : +- Project (71) - : : +- Filter (70) - : : +- Scan parquet (69) - : +- BroadcastExchange (79) - : +- Project (78) - : +- Filter (77) - : +- Scan parquet (76) - +- BroadcastExchange (85) - +- Project (84) - +- Filter (83) - +- Scan parquet (82) + TakeOrderedAndProject (92) + +- HashAggregate (91) + +- Exchange (90) + +- HashAggregate (89) + +- Project (88) + +- BroadcastHashJoin Inner BuildRight (87) + :- Project (82) + : +- BroadcastHashJoin Inner BuildRight (81) + : :- Project (76) + : : +- BroadcastHashJoin Inner BuildLeft (75) + : : :- BroadcastExchange (63) + : : : +- Filter (62) + : : : +- Scan parquet (61) + : : +- BroadcastHashJoin LeftAnti BuildRight (74) + : : :- BroadcastHashJoin LeftSemi BuildRight (69) + : : : :- Project (66) + : : : : +- Filter (65) + : : : : +- Scan parquet (64) + : : : +- BroadcastExchange (68) + : : : +- Scan parquet (67) + : : +- BroadcastExchange (73) + : : +- Project (72) + : : +- Filter (71) + : : +- Scan parquet (70) + : +- BroadcastExchange (80) + : +- Project (79) + : +- Filter (78) + : +- Scan parquet (77) + +- BroadcastExchange (86) + +- Project (85) + +- Filter (84) + +- Scan parquet (83) (1) Scan parquet @@ -307,194 +302,198 @@ Input [2]: [s_name#X, count#X] Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: false -(52) ColumnarExchange +(52) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, count#X] +Arguments: X + +(53) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(54) ShuffleQueryStage Output [2]: [s_name#X, count#X] Arguments: X -(54) InputAdapter +(55) InputAdapter Input [2]: [s_name#X, count#X] -(55) InputIteratorTransformer +(56) InputIteratorTransformer Input [2]: [s_name#X, count#X] -(56) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(57) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, numwait#X] Arguments: false -(58) TakeOrderedAndProjectExecTransformer +(59) TakeOrderedAndProjectExecTransformer Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X], 0 -(59) VeloxColumnarToRowExec +(60) VeloxColumnarToRowExec Input [2]: [s_name#X, numwait#X] -(60) Scan parquet +(61) Scan parquet Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(61) Filter +(62) Filter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(62) BroadcastExchange +(63) BroadcastExchange Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(63) Scan parquet +(64) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(64) Filter +(65) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : 
((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(65) Project +(66) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(66) Scan parquet +(67) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(67) BroadcastExchange +(68) BroadcastExchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(68) BroadcastHashJoin +(69) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(69) Scan parquet +(70) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(70) Filter +(71) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(71) Project +(72) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(72) BroadcastExchange +(73) BroadcastExchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(73) BroadcastHashJoin +(74) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(74) BroadcastHashJoin +(75) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(75) Project +(76) Project Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(76) Scan parquet +(77) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(77) Filter +(78) Filter Input [2]: [o_orderkey#X, o_orderstatus#X] Condition : ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(78) Project +(79) Project Output [1]: [o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(79) BroadcastExchange +(80) BroadcastExchange Input [1]: [o_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(80) BroadcastHashJoin +(81) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(81) Project +(82) Project Output [2]: [s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(82) Scan parquet +(83) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(83) Filter +(84) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(84) Project +(85) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(85) BroadcastExchange +(86) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] 
-(86) BroadcastHashJoin +(87) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(87) Project +(88) Project Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(88) HashAggregate +(89) HashAggregate Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(89) Exchange +(90) Exchange Input [2]: [s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) HashAggregate +(91) HashAggregate Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(91) TakeOrderedAndProject +(92) TakeOrderedAndProject Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X] -(92) AdaptiveSparkPlan +(93) AdaptiveSparkPlan Output [2]: [s_name#X, numwait#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/22.txt index 3dc92be2d96e0..4c149d9bb43ed 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/22.txt @@ -1,40 +1,39 @@ == Physical Plan == -AdaptiveSparkPlan (38) +AdaptiveSparkPlan (40) +- == Final Plan == - VeloxColumnarToRowExec (26) - +- ^ SortExecTransformer (24) - +- ^ InputIteratorTransformer (23) - +- ^ InputAdapter (22) - +- ^ ShuffleQueryStage (21) - +- ColumnarExchange (20) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ InputIteratorTransformer (17) - +- ^ InputAdapter (16) - +- ^ ShuffleQueryStage (15) - +- ColumnarExchange (14) + VeloxColumnarToRowExec (28) + +- ^ SortExecTransformer (26) + +- ^ InputIteratorTransformer (25) + +- ShuffleQueryStage (23) + +- ColumnarExchange (22) + +- VeloxAppendBatches (21) + +- ^ RegularHashAggregateExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) +- ^ ProjectExecTransformer (12) +- ^ FlushableHashAggregateExecTransformer (11) +- ^ ProjectExecTransformer (10) - +- ^ BroadcastHashJoinExecTransformer LeftAnti (9) + +- ^ BroadcastHashJoinExecTransformer LeftAnti BuildRight (9) :- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (8) - +- ^ InputAdapter (7) - +- ^ BroadcastQueryStage (6) - +- ColumnarBroadcastExchange (5) - +- ^ Scan parquet (3) + +- BroadcastQueryStage (6) + +- ColumnarBroadcastExchange (5) + +- ^ Scan parquet (3) +- == Initial Plan == - Sort (37) - +- Exchange (36) - +- HashAggregate (35) - +- Exchange (34) - +- HashAggregate (33) - +- Project (32) - +- BroadcastHashJoin LeftAnti BuildRight (31) - :- Filter (28) - : +- Scan parquet (27) - +- BroadcastExchange (30) - +- Scan parquet (29) + Sort (39) + +- Exchange (38) + +- HashAggregate (37) + +- Exchange (36) + +- HashAggregate (35) + +- Project (34) + +- BroadcastHashJoin LeftAnti BuildRight (33) + :- Filter (30) + : +- Scan parquet (29) + +- BroadcastExchange (32) + +- Scan parquet (31) (1) Scan parquet @@ -96,112 +95,120 @@ Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches 
+Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(19) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(20) ColumnarExchange +(21) VeloxAppendBatches +Input [3]: [cntrycode#X, numcust#X, totacctbal#X] +Arguments: X + +(22) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(23) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: X -(22) InputAdapter +(24) InputAdapter Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(23) InputIteratorTransformer +(25) InputIteratorTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(24) SortExecTransformer +(26) SortExecTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(25) WholeStageCodegenTransformer (X) +(27) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(26) VeloxColumnarToRowExec +(28) VeloxColumnarToRowExec Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(27) Scan parquet +(29) Scan parquet Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal)] ReadSchema: struct -(28) Filter +(30) Filter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) AND (cast(c_acctbal#X as decimal(16,6)) > Subquery subquery#X, [id=#X])) -(29) Scan parquet +(31) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(30) BroadcastExchange +(32) BroadcastExchange Input [1]: [o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(31) BroadcastHashJoin +(33) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(32) Project +(34) Project Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(33) HashAggregate +(35) HashAggregate Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(34) Exchange +(36) Exchange Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), 
ENSURE_REQUIREMENTS, [plan_id=X] -(35) HashAggregate +(37) HashAggregate Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(36) Exchange +(38) Exchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(37) Sort +(39) Sort Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(38) AdaptiveSparkPlan +(40) AdaptiveSparkPlan Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/3.txt index f962dbdd765f4..df2a6de8d1846 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/3.txt @@ -1,55 +1,53 @@ == Physical Plan == -AdaptiveSparkPlan (53) +AdaptiveSparkPlan (54) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- TakeOrderedAndProjectExecTransformer (33) - +- ^ ProjectExecTransformer (31) - +- ^ RegularHashAggregateExecTransformer (30) - +- ^ InputIteratorTransformer (29) - +- ^ InputAdapter (28) - +- ^ ShuffleQueryStage (27) - +- ColumnarExchange (26) + VeloxColumnarToRowExec (35) + +- TakeOrderedAndProjectExecTransformer (34) + +- ^ ProjectExecTransformer (32) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) +- ^ ProjectExecTransformer (24) +- ^ FlushableHashAggregateExecTransformer (23) +- ^ ProjectExecTransformer (22) - +- ^ BroadcastHashJoinExecTransformer Inner (21) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (21) :- ^ ProjectExecTransformer (12) - : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ BroadcastQueryStage (6) - : : +- ColumnarBroadcastExchange (5) - : : +- ^ ProjectExecTransformer (3) - : : +- ^ NoopFilter (2) - : : +- ^ Scan parquet (1) + : : +- BroadcastQueryStage (6) + : : +- ColumnarBroadcastExchange (5) + : : +- ^ ProjectExecTransformer (3) + : : +- ^ NoopFilter (2) + : : +- ^ Scan parquet (1) : +- ^ NoopFilter (10) : +- ^ Scan parquet (9) +- ^ InputIteratorTransformer (20) - +- ^ InputAdapter (19) - +- ^ BroadcastQueryStage (18) - +- ColumnarBroadcastExchange (17) - +- ^ ProjectExecTransformer (15) - +- ^ NoopFilter (14) - +- ^ Scan parquet (13) + +- BroadcastQueryStage (18) + +- ColumnarBroadcastExchange (17) + +- ^ ProjectExecTransformer (15) + +- ^ NoopFilter (14) + +- ^ Scan parquet (13) +- == Initial Plan == - TakeOrderedAndProject (52) - +- HashAggregate (51) - +- Exchange (50) - +- HashAggregate (49) - +- Project (48) - +- BroadcastHashJoin Inner BuildRight (47) - :- Project (42) - : +- BroadcastHashJoin Inner BuildLeft (41) - : :- BroadcastExchange (38) - : : +- Project (37) - : : +- Filter (36) - : : +- Scan parquet (35) - : +- Filter (40) - : +- Scan parquet (39) - +- BroadcastExchange (46) - +- Project (45) - +- Filter (44) - +- Scan parquet (43) + TakeOrderedAndProject (53) + +- HashAggregate (52) + +- 
Exchange (51) + +- HashAggregate (50) + +- Project (49) + +- BroadcastHashJoin Inner BuildRight (48) + :- Project (43) + : +- BroadcastHashJoin Inner BuildLeft (42) + : :- BroadcastExchange (39) + : : +- Project (38) + : : +- Filter (37) + : : +- Scan parquet (36) + : +- Filter (41) + : +- Scan parquet (40) + +- BroadcastExchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -162,131 +160,135 @@ Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: false -(26) ColumnarExchange +(26) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] +Arguments: X + +(27) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(27) ShuffleQueryStage +(28) ShuffleQueryStage Output [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: X -(28) InputAdapter +(29) InputAdapter Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(29) InputIteratorTransformer +(30) InputIteratorTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(30) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(31) ProjectExecTransformer +(32) ProjectExecTransformer Output [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, o_orderdate#X, o_shippriority#X] Input [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(32) WholeStageCodegenTransformer (X) +(33) WholeStageCodegenTransformer (X) Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: false -(33) TakeOrderedAndProjectExecTransformer +(34) TakeOrderedAndProjectExecTransformer Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, 
o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X], 0 -(34) VeloxColumnarToRowExec +(35) VeloxColumnarToRowExec Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(35) Scan parquet +(36) Scan parquet Output [2]: [c_custkey#X, c_mktsegment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_mktsegment), EqualTo(c_mktsegment,BUILDING), IsNotNull(c_custkey)] ReadSchema: struct -(36) Filter +(37) Filter Input [2]: [c_custkey#X, c_mktsegment#X] Condition : ((isnotnull(c_mktsegment#X) AND (c_mktsegment#X = BUILDING)) AND isnotnull(c_custkey#X)) -(37) Project +(38) Project Output [1]: [c_custkey#X] Input [2]: [c_custkey#X, c_mktsegment#X] -(38) BroadcastExchange +(39) BroadcastExchange Input [1]: [c_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(39) Scan parquet +(40) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(40) Filter +(41) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Condition : (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(41) BroadcastHashJoin +(42) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(42) Project +(43) Project Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(43) Scan parquet +(44) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(44) Filter +(45) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(45) Project +(46) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(46) BroadcastExchange +(47) BroadcastExchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(47) BroadcastHashJoin +(48) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(48) Project +(49) Project Output [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(49) HashAggregate +(50) HashAggregate Input [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(50) Exchange +(51) Exchange Input [5]: [l_orderkey#X, o_orderdate#X, 
o_shippriority#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) HashAggregate +(52) HashAggregate Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, o_orderdate#X, o_shippriority#X] -(52) TakeOrderedAndProject +(53) TakeOrderedAndProject Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(53) AdaptiveSparkPlan +(54) AdaptiveSparkPlan Output [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/4.txt index e50973eb5abfd..34cb6467cbc2f 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/4.txt @@ -1,46 +1,45 @@ == Physical Plan == -AdaptiveSparkPlan (44) +AdaptiveSparkPlan (46) +- == Final Plan == - VeloxColumnarToRowExec (29) - +- ^ SortExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24) - +- ColumnarExchange (23) - +- ^ RegularHashAggregateExecTransformer (21) - +- ^ InputIteratorTransformer (20) - +- ^ InputAdapter (19) - +- ^ ShuffleQueryStage (18) - +- ColumnarExchange (17) + VeloxColumnarToRowExec (31) + +- ^ SortExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ InputIteratorTransformer (21) + +- ShuffleQueryStage (19) + +- ColumnarExchange (18) + +- VeloxAppendBatches (17) +- ^ ProjectExecTransformer (15) +- ^ FlushableHashAggregateExecTransformer (14) +- ^ ProjectExecTransformer (13) - +- ^ BroadcastHashJoinExecTransformer LeftSemi (12) + +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (12) :- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (11) - +- ^ InputAdapter (10) - +- ^ BroadcastQueryStage (9) - +- ColumnarBroadcastExchange (8) - +- ^ ProjectExecTransformer (6) - +- ^ NoopFilter (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (9) + +- ColumnarBroadcastExchange (8) + +- ^ ProjectExecTransformer (6) + +- ^ NoopFilter (5) + +- ^ Scan parquet (4) +- == Initial Plan == - Sort (43) - +- Exchange (42) - +- HashAggregate (41) - +- Exchange (40) - +- HashAggregate (39) - +- 
Project (38) - +- BroadcastHashJoin LeftSemi BuildRight (37) - :- Project (32) - : +- Filter (31) - : +- Scan parquet (30) - +- BroadcastExchange (36) - +- Project (35) - +- Filter (34) - +- Scan parquet (33) + Sort (45) + +- Exchange (44) + +- HashAggregate (43) + +- Exchange (42) + +- HashAggregate (41) + +- Project (40) + +- BroadcastHashJoin LeftSemi BuildRight (39) + :- Project (34) + : +- Filter (33) + : +- Scan parquet (32) + +- BroadcastExchange (38) + +- Project (37) + +- Filter (36) + +- Scan parquet (35) (1) Scan parquet @@ -115,125 +114,133 @@ Input [2]: [o_orderpriority#X, count#X] Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: false -(17) ColumnarExchange +(17) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] +Arguments: X + +(18) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] -(18) ShuffleQueryStage +(19) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] Arguments: X -(19) InputAdapter +(20) InputAdapter Input [2]: [o_orderpriority#X, count#X] -(20) InputIteratorTransformer +(21) InputIteratorTransformer Input [2]: [o_orderpriority#X, count#X] -(21) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [2]: [o_orderpriority#X, order_count#X] +Arguments: X + +(25) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] Arguments: X -(25) InputAdapter +(27) InputAdapter Input [2]: [o_orderpriority#X, order_count#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [2]: [o_orderpriority#X, order_count#X] -(27) SortExecTransformer +(29) SortExecTransformer Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(28) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(29) VeloxColumnarToRowExec +(31) VeloxColumnarToRowExec Input [2]: [o_orderpriority#X, order_count#X] -(30) Scan parquet +(32) Scan parquet Output [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-07-01), LessThan(o_orderdate,1993-10-01)] ReadSchema: struct -(31) Filter +(33) Filter Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Condition : ((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-07-01)) AND (o_orderdate#X < 1993-10-01)) -(32) Project +(34) Project Output [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] -(33) Scan parquet +(35) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(34) 
Filter +(36) Filter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(35) Project +(37) Project Output [1]: [l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(36) BroadcastExchange +(38) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(37) BroadcastHashJoin +(39) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(38) Project +(40) Project Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(39) HashAggregate +(41) HashAggregate Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(40) Exchange +(42) Exchange Input [2]: [o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) HashAggregate +(43) HashAggregate Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(42) Exchange +(44) Exchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) Sort +(45) Sort Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(44) AdaptiveSparkPlan +(46) AdaptiveSparkPlan Output [2]: [o_orderpriority#X, order_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/5.txt index 274c176ff8c86..2c9ba4c938d4c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/5.txt @@ -1,98 +1,93 @@ == Physical Plan == -AdaptiveSparkPlan (100) +AdaptiveSparkPlan (102) +- == Final Plan == - VeloxColumnarToRowExec (65) - +- ^ SortExecTransformer (63) - +- ^ InputIteratorTransformer (62) - +- ^ InputAdapter (61) - +- ^ ShuffleQueryStage (60) - +- ColumnarExchange (59) - +- ^ RegularHashAggregateExecTransformer (57) - +- ^ InputIteratorTransformer (56) - +- ^ InputAdapter (55) - +- ^ ShuffleQueryStage (54) - +- ColumnarExchange (53) + VeloxColumnarToRowExec (67) + +- ^ SortExecTransformer (65) + +- ^ InputIteratorTransformer (64) + +- ShuffleQueryStage (62) + +- ColumnarExchange (61) + +- VeloxAppendBatches (60) + +- ^ RegularHashAggregateExecTransformer (58) + +- ^ InputIteratorTransformer (57) + +- ShuffleQueryStage (55) + +- ColumnarExchange (54) + +- VeloxAppendBatches (53) +- ^ ProjectExecTransformer (51) +- ^ FlushableHashAggregateExecTransformer (50) +- ^ ProjectExecTransformer (49) - +- ^ BroadcastHashJoinExecTransformer Inner (48) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (48) :- ^ ProjectExecTransformer (39) - : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : :- ^ ProjectExecTransformer (30) - : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : :- ^ ProjectExecTransformer (21) - : : : +- ^ 
BroadcastHashJoinExecTransformer Inner (20) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : :- ^ ProjectExecTransformer (12) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : :- ^ InputIteratorTransformer (7) - : : : : : +- ^ InputAdapter (6) - : : : : : +- ^ BroadcastQueryStage (5) - : : : : : +- ColumnarBroadcastExchange (4) - : : : : : +- ^ NoopFilter (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (5) + : : : : : +- ColumnarBroadcastExchange (4) + : : : : : +- ^ NoopFilter (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ ProjectExecTransformer (10) : : : : +- ^ NoopFilter (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (19) - : : : +- ^ InputAdapter (18) - : : : +- ^ BroadcastQueryStage (17) - : : : +- ColumnarBroadcastExchange (16) - : : : +- ^ NoopFilter (14) - : : : +- ^ Scan parquet (13) + : : : +- BroadcastQueryStage (17) + : : : +- ColumnarBroadcastExchange (16) + : : : +- ^ NoopFilter (14) + : : : +- ^ Scan parquet (13) : : +- ^ InputIteratorTransformer (28) - : : +- ^ InputAdapter (27) - : : +- ^ BroadcastQueryStage (26) - : : +- ColumnarBroadcastExchange (25) - : : +- ^ NoopFilter (23) - : : +- ^ Scan parquet (22) + : : +- BroadcastQueryStage (26) + : : +- ColumnarBroadcastExchange (25) + : : +- ^ NoopFilter (23) + : : +- ^ Scan parquet (22) : +- ^ InputIteratorTransformer (37) - : +- ^ InputAdapter (36) - : +- ^ BroadcastQueryStage (35) - : +- ColumnarBroadcastExchange (34) - : +- ^ NoopFilter (32) - : +- ^ Scan parquet (31) + : +- BroadcastQueryStage (35) + : +- ColumnarBroadcastExchange (34) + : +- ^ NoopFilter (32) + : +- ^ Scan parquet (31) +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ BroadcastQueryStage (45) - +- ColumnarBroadcastExchange (44) - +- ^ ProjectExecTransformer (42) - +- ^ NoopFilter (41) - +- ^ Scan parquet (40) + +- BroadcastQueryStage (45) + +- ColumnarBroadcastExchange (44) + +- ^ ProjectExecTransformer (42) + +- ^ NoopFilter (41) + +- ^ Scan parquet (40) +- == Initial Plan == - Sort (99) - +- Exchange (98) - +- HashAggregate (97) - +- Exchange (96) - +- HashAggregate (95) - +- Project (94) - +- BroadcastHashJoin Inner BuildRight (93) - :- Project (88) - : +- BroadcastHashJoin Inner BuildRight (87) - : :- Project (83) - : : +- BroadcastHashJoin Inner BuildRight (82) - : : :- Project (78) - : : : +- BroadcastHashJoin Inner BuildRight (77) - : : : :- Project (73) - : : : : +- BroadcastHashJoin Inner BuildLeft (72) - : : : : :- BroadcastExchange (68) - : : : : : +- Filter (67) - : : : : : +- Scan parquet (66) - : : : : +- Project (71) - : : : : +- Filter (70) - : : : : +- Scan parquet (69) - : : : +- BroadcastExchange (76) - : : : +- Filter (75) - : : : +- Scan parquet (74) - : : +- BroadcastExchange (81) - : : +- Filter (80) - : : +- Scan parquet (79) - : +- BroadcastExchange (86) - : +- Filter (85) - : +- Scan parquet (84) - +- BroadcastExchange (92) - +- Project (91) - +- Filter (90) - +- Scan parquet (89) + Sort (101) + +- Exchange (100) + +- HashAggregate (99) + +- Exchange (98) + +- HashAggregate (97) + +- Project (96) + +- BroadcastHashJoin Inner BuildRight (95) + :- Project (90) + : +- BroadcastHashJoin Inner BuildRight (89) + : :- Project (85) + : : +- BroadcastHashJoin Inner BuildRight (84) + : : :- Project (80) + : : : +- BroadcastHashJoin Inner BuildRight (79) + : : : :- Project (75) + : : : : +- BroadcastHashJoin Inner BuildLeft (74) + : : : : :- BroadcastExchange 
(70) + : : : : : +- Filter (69) + : : : : : +- Scan parquet (68) + : : : : +- Project (73) + : : : : +- Filter (72) + : : : : +- Scan parquet (71) + : : : +- BroadcastExchange (78) + : : : +- Filter (77) + : : : +- Scan parquet (76) + : : +- BroadcastExchange (83) + : : +- Filter (82) + : : +- Scan parquet (81) + : +- BroadcastExchange (88) + : +- Filter (87) + : +- Scan parquet (86) + +- BroadcastExchange (94) + +- Project (93) + +- Filter (92) + +- Scan parquet (91) (1) Scan parquet @@ -319,221 +314,229 @@ Input [3]: [n_name#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: false -(53) ColumnarExchange +(53) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] +Arguments: X + +(54) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(54) ShuffleQueryStage +(55) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] Arguments: X -(55) InputAdapter +(56) InputAdapter Input [3]: [n_name#X, sum#X, isEmpty#X] -(56) InputIteratorTransformer +(57) InputIteratorTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] -(57) RegularHashAggregateExecTransformer +(58) RegularHashAggregateExecTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(58) WholeStageCodegenTransformer (X) +(59) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(59) ColumnarExchange +(60) VeloxAppendBatches +Input [2]: [n_name#X, revenue#X] +Arguments: X + +(61) ColumnarExchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(60) ShuffleQueryStage +(62) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] Arguments: X -(61) InputAdapter +(63) InputAdapter Input [2]: [n_name#X, revenue#X] -(62) InputIteratorTransformer +(64) InputIteratorTransformer Input [2]: [n_name#X, revenue#X] -(63) SortExecTransformer +(65) SortExecTransformer Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(64) WholeStageCodegenTransformer (X) +(66) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(65) VeloxColumnarToRowExec +(67) VeloxColumnarToRowExec Input [2]: [n_name#X, revenue#X] -(66) Scan parquet +(68) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(67) Filter +(69) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(68) BroadcastExchange +(70) BroadcastExchange 
Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(69) Scan parquet +(71) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(71) Project +(73) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(72) BroadcastHashJoin +(74) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(73) Project +(75) Project Output [2]: [c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(74) Scan parquet +(76) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(75) Filter +(77) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(76) BroadcastExchange +(78) BroadcastExchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(77) BroadcastHashJoin +(79) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(78) Project +(80) Project Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(79) Scan parquet +(81) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(80) Filter +(82) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(81) BroadcastExchange +(83) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(82) BroadcastHashJoin +(84) BroadcastHashJoin Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join condition: None -(83) Project +(85) Project Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(84) Scan parquet +(86) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(85) Filter +(87) Filter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(86) BroadcastExchange +(88) BroadcastExchange Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(87) BroadcastHashJoin +(89) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right 
keys [1]: [n_nationkey#X] Join condition: None -(88) Project +(90) Project Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(89) Scan parquet +(91) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(90) Filter +(92) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(91) Project +(93) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(92) BroadcastExchange +(94) BroadcastExchange Input [1]: [r_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(93) BroadcastHashJoin +(95) BroadcastHashJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(94) Project +(96) Project Output [3]: [l_extendedprice#X, l_discount#X, n_name#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(95) HashAggregate +(97) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, n_name#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(96) Exchange +(98) Exchange Input [3]: [n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) HashAggregate +(99) HashAggregate Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(98) Exchange +(100) Exchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Sort +(101) Sort Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(100) AdaptiveSparkPlan +(102) AdaptiveSparkPlan Output [2]: [n_name#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/6.txt index 8d1a71e9751e2..b6f876d48e5a0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/6.txt @@ -1,23 +1,23 @@ == Physical Plan == -AdaptiveSparkPlan (19) +AdaptiveSparkPlan (20) +- == Final Plan == - 
VeloxColumnarToRowExec (12) - +- ^ RegularHashAggregateExecTransformer (10) - +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ ShuffleQueryStage (7) - +- ColumnarExchange (6) + VeloxColumnarToRowExec (13) + +- ^ RegularHashAggregateExecTransformer (11) + +- ^ InputIteratorTransformer (10) + +- ShuffleQueryStage (8) + +- ColumnarExchange (7) + +- VeloxAppendBatches (6) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ NoopFilter (2) +- ^ Scan parquet (1) +- == Initial Plan == - HashAggregate (18) - +- Exchange (17) - +- HashAggregate (16) - +- Project (15) - +- Filter (14) - +- Scan parquet (13) + HashAggregate (19) + +- Exchange (18) + +- HashAggregate (17) + +- Project (16) + +- Filter (15) + +- Scan parquet (14) (1) Scan parquet @@ -46,67 +46,71 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(6) ColumnarExchange +(6) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(7) ShuffleQueryStage +(8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(8) InputAdapter +(9) InputAdapter Input [2]: [sum#X, isEmpty#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(10) RegularHashAggregateExecTransformer +(11) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X AS revenue#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(12) VeloxColumnarToRowExec +(13) VeloxColumnarToRowExec Input [1]: [revenue#X] -(13) Scan parquet +(14) Scan parquet Output [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), IsNotNull(l_discount), IsNotNull(l_quantity), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), GreaterThanOrEqual(l_discount,0.05), LessThanOrEqual(l_discount,0.07), LessThan(l_quantity,24.00)] ReadSchema: struct -(14) Filter +(15) Filter Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((((((isnotnull(l_shipdate#X) AND isnotnull(l_discount#X)) AND isnotnull(l_quantity#X)) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND (l_discount#X >= 0.05)) AND (l_discount#X <= 0.07)) AND (l_quantity#X < 24.00)) -(15) Project +(16) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) HashAggregate +(17) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(17) Exchange +(18) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(18) HashAggregate +(19) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] 
Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X AS revenue#X] -(19) AdaptiveSparkPlan +(20) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/7.txt index 3d5b6496fe270..8cf20f7694c9e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/7.txt @@ -1,92 +1,87 @@ == Physical Plan == -AdaptiveSparkPlan (93) +AdaptiveSparkPlan (95) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- ^ SortExecTransformer (58) - +- ^ InputIteratorTransformer (57) - +- ^ InputAdapter (56) - +- ^ ShuffleQueryStage (55) - +- ColumnarExchange (54) - +- ^ RegularHashAggregateExecTransformer (52) - +- ^ InputIteratorTransformer (51) - +- ^ InputAdapter (50) - +- ^ ShuffleQueryStage (49) - +- ColumnarExchange (48) + VeloxColumnarToRowExec (62) + +- ^ SortExecTransformer (60) + +- ^ InputIteratorTransformer (59) + +- ShuffleQueryStage (57) + +- ColumnarExchange (56) + +- VeloxAppendBatches (55) + +- ^ RegularHashAggregateExecTransformer (53) + +- ^ InputIteratorTransformer (52) + +- ShuffleQueryStage (50) + +- ColumnarExchange (49) + +- VeloxAppendBatches (48) +- ^ ProjectExecTransformer (46) +- ^ FlushableHashAggregateExecTransformer (45) +- ^ ProjectExecTransformer (44) - +- ^ BroadcastHashJoinExecTransformer Inner (43) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (43) :- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (37) : :- ^ ProjectExecTransformer (29) - : : +- ^ BroadcastHashJoinExecTransformer Inner (28) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (28) : : :- ^ ProjectExecTransformer (20) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (19) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (19) : : : :- ^ ProjectExecTransformer (11) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (10) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (10) : : : : :- ^ InputIteratorTransformer (7) - : : : : : +- ^ InputAdapter (6) - : : : : : +- ^ BroadcastQueryStage (5) - : : : : : +- ColumnarBroadcastExchange (4) - : : : : : +- ^ NoopFilter (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (5) + : : : : : +- ColumnarBroadcastExchange (4) + : : : : : +- ^ NoopFilter (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ NoopFilter (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (18) - : : : +- ^ InputAdapter (17) - : : : +- ^ BroadcastQueryStage (16) - : : : +- ColumnarBroadcastExchange (15) - : : : +- ^ NoopFilter (13) - : : : +- ^ Scan parquet (12) + : : : +- BroadcastQueryStage (16) + : : : +- ColumnarBroadcastExchange (15) + : : : +- ^ NoopFilter (13) + : : : +- ^ Scan parquet (12) : : +- ^ InputIteratorTransformer (27) - : : +- ^ InputAdapter (26) - : : +- ^ BroadcastQueryStage (25) - : : +- ColumnarBroadcastExchange (24) - : : +- ^ 
NoopFilter (22) - : : +- ^ Scan parquet (21) + : : +- BroadcastQueryStage (25) + : : +- ColumnarBroadcastExchange (24) + : : +- ^ NoopFilter (22) + : : +- ^ Scan parquet (21) : +- ^ InputIteratorTransformer (36) - : +- ^ InputAdapter (35) - : +- ^ BroadcastQueryStage (34) - : +- ColumnarBroadcastExchange (33) - : +- ^ NoopFilter (31) - : +- ^ Scan parquet (30) + : +- BroadcastQueryStage (34) + : +- ColumnarBroadcastExchange (33) + : +- ^ NoopFilter (31) + : +- ^ Scan parquet (30) +- ^ InputIteratorTransformer (42) - +- ^ InputAdapter (41) - +- ^ BroadcastQueryStage (40) - +- ReusedExchange (39) + +- BroadcastQueryStage (40) + +- ReusedExchange (39) +- == Initial Plan == - Sort (92) - +- Exchange (91) - +- HashAggregate (90) - +- Exchange (89) - +- HashAggregate (88) - +- Project (87) - +- BroadcastHashJoin Inner BuildRight (86) - :- Project (82) - : +- BroadcastHashJoin Inner BuildRight (81) - : :- Project (77) - : : +- BroadcastHashJoin Inner BuildRight (76) - : : :- Project (72) - : : : +- BroadcastHashJoin Inner BuildRight (71) - : : : :- Project (67) - : : : : +- BroadcastHashJoin Inner BuildLeft (66) - : : : : :- BroadcastExchange (63) - : : : : : +- Filter (62) - : : : : : +- Scan parquet (61) - : : : : +- Filter (65) - : : : : +- Scan parquet (64) - : : : +- BroadcastExchange (70) - : : : +- Filter (69) - : : : +- Scan parquet (68) - : : +- BroadcastExchange (75) - : : +- Filter (74) - : : +- Scan parquet (73) - : +- BroadcastExchange (80) - : +- Filter (79) - : +- Scan parquet (78) - +- BroadcastExchange (85) - +- Filter (84) - +- Scan parquet (83) + Sort (94) + +- Exchange (93) + +- HashAggregate (92) + +- Exchange (91) + +- HashAggregate (90) + +- Project (89) + +- BroadcastHashJoin Inner BuildRight (88) + :- Project (84) + : +- BroadcastHashJoin Inner BuildRight (83) + : :- Project (79) + : : +- BroadcastHashJoin Inner BuildRight (78) + : : :- Project (74) + : : : +- BroadcastHashJoin Inner BuildRight (73) + : : : :- Project (69) + : : : : +- BroadcastHashJoin Inner BuildLeft (68) + : : : : :- BroadcastExchange (65) + : : : : : +- Filter (64) + : : : : : +- Scan parquet (63) + : : : : +- Filter (67) + : : : : +- Scan parquet (66) + : : : +- BroadcastExchange (72) + : : : +- Filter (71) + : : : +- Scan parquet (70) + : : +- BroadcastExchange (77) + : : +- Filter (76) + : : +- Scan parquet (75) + : +- BroadcastExchange (82) + : +- Filter (81) + : +- Scan parquet (80) + +- BroadcastExchange (87) + +- Filter (86) + +- Scan parquet (85) (1) Scan parquet @@ -289,213 +284,221 @@ Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: false -(48) ColumnarExchange +(48) VeloxAppendBatches +Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] +Arguments: X + +(49) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(49) ShuffleQueryStage +(50) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: X -(50) InputAdapter +(51) InputAdapter Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(51) InputIteratorTransformer +(52) InputIteratorTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(52) 
RegularHashAggregateExecTransformer +(53) RegularHashAggregateExecTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(53) WholeStageCodegenTransformer (X) +(54) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(54) ColumnarExchange +(55) VeloxAppendBatches +Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] +Arguments: X + +(56) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(55) ShuffleQueryStage +(57) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: X -(56) InputAdapter +(58) InputAdapter Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(57) InputIteratorTransformer +(59) InputIteratorTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(58) SortExecTransformer +(60) SortExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(59) WholeStageCodegenTransformer (X) +(61) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(60) VeloxColumnarToRowExec +(62) VeloxColumnarToRowExec Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(61) Scan parquet +(63) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(62) Filter +(64) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(63) BroadcastExchange +(65) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(64) Scan parquet +(66) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(65) Filter +(67) Filter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(66) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(67) Project +(69) Project Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(68) Scan parquet +(70) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(69) Filter +(71) Filter Input [2]: [o_orderkey#X, o_custkey#X] Condition : 
(isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(70) BroadcastExchange +(72) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(71) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(72) Project +(74) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(73) Scan parquet +(75) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(74) Filter +(76) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(75) BroadcastExchange +(77) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(76) BroadcastHashJoin +(78) BroadcastHashJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(77) Project +(79) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(78) Scan parquet +(80) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(79) Filter +(81) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(80) BroadcastExchange +(82) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(81) BroadcastHashJoin +(83) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(82) Project +(84) Project Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(83) Scan parquet +(85) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,GERMANY),EqualTo(n_name,FRANCE))] ReadSchema: struct -(84) Filter +(86) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = GERMANY) OR (n_name#X = FRANCE))) -(85) BroadcastExchange +(87) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(87) Project +(89) Project Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS volume#X] Input [7]: 
[l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(88) HashAggregate +(90) HashAggregate Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(89) Exchange +(91) Exchange Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) HashAggregate +(92) HashAggregate Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(91) Exchange +(93) Exchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(92) Sort +(94) Sort Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(93) AdaptiveSparkPlan +(95) AdaptiveSparkPlan Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/8.txt index 9489452b52726..045e283bd036c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/8.txt @@ -1,125 +1,118 @@ == Physical Plan == -AdaptiveSparkPlan (129) +AdaptiveSparkPlan (131) +- == Final Plan == - VeloxColumnarToRowExec (84) - +- ^ SortExecTransformer (82) - +- ^ InputIteratorTransformer (81) - +- ^ InputAdapter (80) - +- ^ ShuffleQueryStage (79) - +- ColumnarExchange (78) - +- ^ ProjectExecTransformer (76) - +- ^ RegularHashAggregateExecTransformer (75) - +- ^ InputIteratorTransformer (74) - +- ^ InputAdapter (73) - +- ^ ShuffleQueryStage (72) - +- ColumnarExchange (71) + VeloxColumnarToRowExec (86) + +- ^ SortExecTransformer (84) + +- ^ InputIteratorTransformer (83) + +- ShuffleQueryStage (81) + +- ColumnarExchange (80) + +- VeloxAppendBatches (79) + +- ^ ProjectExecTransformer (77) + +- ^ RegularHashAggregateExecTransformer (76) + +- ^ InputIteratorTransformer (75) + +- ShuffleQueryStage (73) + +- ColumnarExchange (72) + +- VeloxAppendBatches (71) +- ^ ProjectExecTransformer (69) +- ^ FlushableHashAggregateExecTransformer (68) +- ^ ProjectExecTransformer (67) - +- ^ BroadcastHashJoinExecTransformer Inner (66) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (66) :- ^ ProjectExecTransformer (57) - : +- ^ BroadcastHashJoinExecTransformer Inner (56) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (56) : :- ^ ProjectExecTransformer (48) - : : +- ^ BroadcastHashJoinExecTransformer Inner (47) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) : : :- ^ ProjectExecTransformer (39) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : : : :- ^ ProjectExecTransformer (30) - : : : : +- ^ 
BroadcastHashJoinExecTransformer Inner (29) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : : : :- ^ ProjectExecTransformer (21) - : : : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : : : :- ^ ProjectExecTransformer (12) - : : : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : : : :- ^ InputIteratorTransformer (8) - : : : : : : : +- ^ InputAdapter (7) - : : : : : : : +- ^ BroadcastQueryStage (6) - : : : : : : : +- ColumnarBroadcastExchange (5) - : : : : : : : +- ^ ProjectExecTransformer (3) - : : : : : : : +- ^ NoopFilter (2) - : : : : : : : +- ^ Scan parquet (1) + : : : : : : : +- BroadcastQueryStage (6) + : : : : : : : +- ColumnarBroadcastExchange (5) + : : : : : : : +- ^ ProjectExecTransformer (3) + : : : : : : : +- ^ NoopFilter (2) + : : : : : : : +- ^ Scan parquet (1) : : : : : : +- ^ NoopFilter (10) : : : : : : +- ^ Scan parquet (9) : : : : : +- ^ InputIteratorTransformer (19) - : : : : : +- ^ InputAdapter (18) - : : : : : +- ^ BroadcastQueryStage (17) - : : : : : +- ColumnarBroadcastExchange (16) - : : : : : +- ^ NoopFilter (14) - : : : : : +- ^ Scan parquet (13) + : : : : : +- BroadcastQueryStage (17) + : : : : : +- ColumnarBroadcastExchange (16) + : : : : : +- ^ NoopFilter (14) + : : : : : +- ^ Scan parquet (13) : : : : +- ^ InputIteratorTransformer (28) - : : : : +- ^ InputAdapter (27) - : : : : +- ^ BroadcastQueryStage (26) - : : : : +- ColumnarBroadcastExchange (25) - : : : : +- ^ NoopFilter (23) - : : : : +- ^ Scan parquet (22) + : : : : +- BroadcastQueryStage (26) + : : : : +- ColumnarBroadcastExchange (25) + : : : : +- ^ NoopFilter (23) + : : : : +- ^ Scan parquet (22) : : : +- ^ InputIteratorTransformer (37) - : : : +- ^ InputAdapter (36) - : : : +- ^ BroadcastQueryStage (35) - : : : +- ColumnarBroadcastExchange (34) - : : : +- ^ NoopFilter (32) - : : : +- ^ Scan parquet (31) + : : : +- BroadcastQueryStage (35) + : : : +- ColumnarBroadcastExchange (34) + : : : +- ^ NoopFilter (32) + : : : +- ^ Scan parquet (31) : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ BroadcastQueryStage (44) - : : +- ColumnarBroadcastExchange (43) - : : +- ^ NoopFilter (41) - : : +- ^ Scan parquet (40) + : : +- BroadcastQueryStage (44) + : : +- ColumnarBroadcastExchange (43) + : : +- ^ NoopFilter (41) + : : +- ^ Scan parquet (40) : +- ^ InputIteratorTransformer (55) - : +- ^ InputAdapter (54) - : +- ^ BroadcastQueryStage (53) - : +- ColumnarBroadcastExchange (52) - : +- ^ NoopFilter (50) - : +- ^ Scan parquet (49) + : +- BroadcastQueryStage (53) + : +- ColumnarBroadcastExchange (52) + : +- ^ NoopFilter (50) + : +- ^ Scan parquet (49) +- ^ InputIteratorTransformer (65) - +- ^ InputAdapter (64) - +- ^ BroadcastQueryStage (63) - +- ColumnarBroadcastExchange (62) - +- ^ ProjectExecTransformer (60) - +- ^ NoopFilter (59) - +- ^ Scan parquet (58) + +- BroadcastQueryStage (63) + +- ColumnarBroadcastExchange (62) + +- ^ ProjectExecTransformer (60) + +- ^ NoopFilter (59) + +- ^ Scan parquet (58) +- == Initial Plan == - Sort (128) - +- Exchange (127) - +- HashAggregate (126) - +- Exchange (125) - +- HashAggregate (124) - +- Project (123) - +- BroadcastHashJoin Inner BuildRight (122) - :- Project (117) - : +- BroadcastHashJoin Inner BuildRight (116) - : :- Project (112) - : : +- BroadcastHashJoin Inner BuildRight (111) - : : :- Project (107) - : : : +- BroadcastHashJoin Inner BuildRight 
(106) - : : : :- Project (102) - : : : : +- BroadcastHashJoin Inner BuildRight (101) - : : : : :- Project (97) - : : : : : +- BroadcastHashJoin Inner BuildRight (96) - : : : : : :- Project (92) - : : : : : : +- BroadcastHashJoin Inner BuildLeft (91) - : : : : : : :- BroadcastExchange (88) - : : : : : : : +- Project (87) - : : : : : : : +- Filter (86) - : : : : : : : +- Scan parquet (85) - : : : : : : +- Filter (90) - : : : : : : +- Scan parquet (89) - : : : : : +- BroadcastExchange (95) - : : : : : +- Filter (94) - : : : : : +- Scan parquet (93) - : : : : +- BroadcastExchange (100) - : : : : +- Filter (99) - : : : : +- Scan parquet (98) - : : : +- BroadcastExchange (105) - : : : +- Filter (104) - : : : +- Scan parquet (103) - : : +- BroadcastExchange (110) - : : +- Filter (109) - : : +- Scan parquet (108) - : +- BroadcastExchange (115) - : +- Filter (114) - : +- Scan parquet (113) - +- BroadcastExchange (121) - +- Project (120) - +- Filter (119) - +- Scan parquet (118) + Sort (130) + +- Exchange (129) + +- HashAggregate (128) + +- Exchange (127) + +- HashAggregate (126) + +- Project (125) + +- BroadcastHashJoin Inner BuildRight (124) + :- Project (119) + : +- BroadcastHashJoin Inner BuildRight (118) + : :- Project (114) + : : +- BroadcastHashJoin Inner BuildRight (113) + : : :- Project (109) + : : : +- BroadcastHashJoin Inner BuildRight (108) + : : : :- Project (104) + : : : : +- BroadcastHashJoin Inner BuildRight (103) + : : : : :- Project (99) + : : : : : +- BroadcastHashJoin Inner BuildRight (98) + : : : : : :- Project (94) + : : : : : : +- BroadcastHashJoin Inner BuildLeft (93) + : : : : : : :- BroadcastExchange (90) + : : : : : : : +- Project (89) + : : : : : : : +- Filter (88) + : : : : : : : +- Scan parquet (87) + : : : : : : +- Filter (92) + : : : : : : +- Scan parquet (91) + : : : : : +- BroadcastExchange (97) + : : : : : +- Filter (96) + : : : : : +- Scan parquet (95) + : : : : +- BroadcastExchange (102) + : : : : +- Filter (101) + : : : : +- Scan parquet (100) + : : : +- BroadcastExchange (107) + : : : +- Filter (106) + : : : +- Scan parquet (105) + : : +- BroadcastExchange (112) + : : +- Filter (111) + : : +- Scan parquet (110) + : +- BroadcastExchange (117) + : +- Filter (116) + : +- Scan parquet (115) + +- BroadcastExchange (123) + +- Project (122) + +- Filter (121) + +- Scan parquet (120) (1) Scan parquet @@ -422,273 +415,281 @@ Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(71) ColumnarExchange +(71) VeloxAppendBatches +Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(72) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(72) ShuffleQueryStage +(73) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(73) InputAdapter +(74) InputAdapter Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(74) InputIteratorTransformer +(75) InputIteratorTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(75) RegularHashAggregateExecTransformer +(76) RegularHashAggregateExecTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE 
WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(76) ProjectExecTransformer +(77) ProjectExecTransformer Output [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6), true) AS mkt_share#X] Input [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(77) WholeStageCodegenTransformer (X) +(78) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(78) ColumnarExchange +(79) VeloxAppendBatches +Input [2]: [o_year#X, mkt_share#X] +Arguments: X + +(80) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(79) ShuffleQueryStage +(81) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] Arguments: X -(80) InputAdapter +(82) InputAdapter Input [2]: [o_year#X, mkt_share#X] -(81) InputIteratorTransformer +(83) InputIteratorTransformer Input [2]: [o_year#X, mkt_share#X] -(82) SortExecTransformer +(84) SortExecTransformer Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(83) WholeStageCodegenTransformer (X) +(85) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(84) VeloxColumnarToRowExec +(86) VeloxColumnarToRowExec Input [2]: [o_year#X, mkt_share#X] -(85) Scan parquet +(87) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_type), EqualTo(p_type,ECONOMY ANODIZED STEEL), IsNotNull(p_partkey)] ReadSchema: struct -(86) Filter +(88) Filter Input [2]: [p_partkey#X, p_type#X] Condition : ((isnotnull(p_type#X) AND (p_type#X = ECONOMY ANODIZED STEEL)) AND isnotnull(p_partkey#X)) -(87) Project +(89) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_type#X] -(88) BroadcastExchange +(90) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(89) Scan parquet +(91) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(90) Filter +(92) Filter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(91) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(92) Project +(94) Project Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(93) Scan parquet +(95) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(94) Filter +(96) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(95) BroadcastExchange +(97) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(96) 
BroadcastHashJoin +(98) BroadcastHashJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(97) Project +(99) Project Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(98) Scan parquet +(100) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(99) Filter +(101) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(100) BroadcastExchange +(102) BroadcastExchange Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(101) BroadcastHashJoin +(103) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(102) Project +(104) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(103) Scan parquet +(105) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(104) Filter +(106) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(105) BroadcastExchange +(107) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(106) BroadcastHashJoin +(108) BroadcastHashJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(107) Project +(109) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(108) Scan parquet +(110) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(109) Filter +(111) Filter Input [2]: [n_nationkey#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(110) BroadcastExchange +(112) BroadcastExchange Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(111) BroadcastHashJoin +(113) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(112) Project +(114) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(113) Scan parquet +(115) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(114) Filter +(116) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(115) 
BroadcastExchange +(117) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(116) BroadcastHashJoin +(118) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(117) Project +(119) Project Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(118) Scan parquet +(120) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(119) Filter +(121) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(120) Project +(122) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(121) BroadcastExchange +(123) BroadcastExchange Input [1]: [r_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(122) BroadcastHashJoin +(124) BroadcastHashJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(123) Project +(125) Project Output [3]: [year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS volume#X, n_name#X AS nation#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(124) HashAggregate +(126) HashAggregate Input [3]: [o_year#X, volume#X, nation#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(125) Exchange +(127) Exchange Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(126) HashAggregate +(128) HashAggregate Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6), true) AS mkt_share#X] -(127) Exchange +(129) Exchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(128) Sort +(130) Sort Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(129) AdaptiveSparkPlan +(131) AdaptiveSparkPlan Output [2]: [o_year#X, mkt_share#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/9.txt index 29e6d2f72f570..abdd1d7b9369a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/9.txt +++ 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/9.txt @@ -1,96 +1,91 @@ == Physical Plan == -AdaptiveSparkPlan (98) +AdaptiveSparkPlan (100) +- == Final Plan == - VeloxColumnarToRowExec (64) - +- ^ SortExecTransformer (62) - +- ^ InputIteratorTransformer (61) - +- ^ InputAdapter (60) - +- ^ ShuffleQueryStage (59) - +- ColumnarExchange (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53) - +- ColumnarExchange (52) + VeloxColumnarToRowExec (66) + +- ^ SortExecTransformer (64) + +- ^ InputIteratorTransformer (63) + +- ShuffleQueryStage (61) + +- ColumnarExchange (60) + +- VeloxAppendBatches (59) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ InputIteratorTransformer (56) + +- ShuffleQueryStage (54) + +- ColumnarExchange (53) + +- VeloxAppendBatches (52) +- ^ ProjectExecTransformer (50) +- ^ FlushableHashAggregateExecTransformer (49) +- ^ ProjectExecTransformer (48) - +- ^ BroadcastHashJoinExecTransformer Inner (47) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) :- ^ ProjectExecTransformer (39) - : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : :- ^ ProjectExecTransformer (30) - : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : :- ^ ProjectExecTransformer (21) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : :- ^ ProjectExecTransformer (12) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ BroadcastQueryStage (6) - : : : : : +- ColumnarBroadcastExchange (5) - : : : : : +- ^ ProjectExecTransformer (3) - : : : : : +- ^ NoopFilter (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (6) + : : : : : +- ColumnarBroadcastExchange (5) + : : : : : +- ^ ProjectExecTransformer (3) + : : : : : +- ^ NoopFilter (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ NoopFilter (10) : : : : +- ^ Scan parquet (9) : : : +- ^ InputIteratorTransformer (19) - : : : +- ^ InputAdapter (18) - : : : +- ^ BroadcastQueryStage (17) - : : : +- ColumnarBroadcastExchange (16) - : : : +- ^ NoopFilter (14) - : : : +- ^ Scan parquet (13) + : : : +- BroadcastQueryStage (17) + : : : +- ColumnarBroadcastExchange (16) + : : : +- ^ NoopFilter (14) + : : : +- ^ Scan parquet (13) : : +- ^ InputIteratorTransformer (28) - : : +- ^ InputAdapter (27) - : : +- ^ BroadcastQueryStage (26) - : : +- ColumnarBroadcastExchange (25) - : : +- ^ NoopFilter (23) - : : +- ^ Scan parquet (22) + : : +- BroadcastQueryStage (26) + : : +- ColumnarBroadcastExchange (25) + : : +- ^ NoopFilter (23) + : : +- ^ Scan parquet (22) : +- ^ InputIteratorTransformer (37) - : +- ^ InputAdapter (36) - : +- ^ BroadcastQueryStage (35) - : +- ColumnarBroadcastExchange (34) - : +- ^ NoopFilter (32) - : +- ^ Scan parquet (31) + : +- BroadcastQueryStage (35) + : +- ColumnarBroadcastExchange (34) + : +- ^ NoopFilter (32) + : +- ^ Scan parquet (31) +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ BroadcastQueryStage (44) - +- ColumnarBroadcastExchange (43) - +- ^ NoopFilter (41) - +- ^ Scan parquet (40) + +- BroadcastQueryStage (44) + +- ColumnarBroadcastExchange (43) + +- ^ NoopFilter (41) + +- ^ Scan parquet (40) 
+- == Initial Plan == - Sort (97) - +- Exchange (96) - +- HashAggregate (95) - +- Exchange (94) - +- HashAggregate (93) - +- Project (92) - +- BroadcastHashJoin Inner BuildRight (91) - :- Project (87) - : +- BroadcastHashJoin Inner BuildRight (86) - : :- Project (82) - : : +- BroadcastHashJoin Inner BuildRight (81) - : : :- Project (77) - : : : +- BroadcastHashJoin Inner BuildRight (76) - : : : :- Project (72) - : : : : +- BroadcastHashJoin Inner BuildLeft (71) - : : : : :- BroadcastExchange (68) - : : : : : +- Project (67) - : : : : : +- Filter (66) - : : : : : +- Scan parquet (65) - : : : : +- Filter (70) - : : : : +- Scan parquet (69) - : : : +- BroadcastExchange (75) - : : : +- Filter (74) - : : : +- Scan parquet (73) - : : +- BroadcastExchange (80) - : : +- Filter (79) - : : +- Scan parquet (78) - : +- BroadcastExchange (85) - : +- Filter (84) - : +- Scan parquet (83) - +- BroadcastExchange (90) - +- Filter (89) - +- Scan parquet (88) + Sort (99) + +- Exchange (98) + +- HashAggregate (97) + +- Exchange (96) + +- HashAggregate (95) + +- Project (94) + +- BroadcastHashJoin Inner BuildRight (93) + :- Project (89) + : +- BroadcastHashJoin Inner BuildRight (88) + : :- Project (84) + : : +- BroadcastHashJoin Inner BuildRight (83) + : : :- Project (79) + : : : +- BroadcastHashJoin Inner BuildRight (78) + : : : :- Project (74) + : : : : +- BroadcastHashJoin Inner BuildLeft (73) + : : : : :- BroadcastExchange (70) + : : : : : +- Project (69) + : : : : : +- Filter (68) + : : : : : +- Scan parquet (67) + : : : : +- Filter (72) + : : : : +- Scan parquet (71) + : : : +- BroadcastExchange (77) + : : : +- Filter (76) + : : : +- Scan parquet (75) + : : +- BroadcastExchange (82) + : : +- Filter (81) + : : +- Scan parquet (80) + : +- BroadcastExchange (87) + : +- Filter (86) + : +- Scan parquet (85) + +- BroadcastExchange (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet @@ -313,217 +308,225 @@ Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: false -(52) ColumnarExchange +(52) VeloxAppendBatches +Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] +Arguments: X + +(53) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(54) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: X -(54) InputAdapter +(55) InputAdapter Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(55) InputIteratorTransformer +(56) InputIteratorTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(56) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(57) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(58) ColumnarExchange +(59) VeloxAppendBatches +Input [3]: [nation#X, o_year#X, sum_profit#X] +Arguments: X + +(60) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(61) ShuffleQueryStage Output 
[3]: [nation#X, o_year#X, sum_profit#X] Arguments: X -(60) InputAdapter +(62) InputAdapter Input [3]: [nation#X, o_year#X, sum_profit#X] -(61) InputIteratorTransformer +(63) InputIteratorTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] -(62) SortExecTransformer +(64) SortExecTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(63) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(64) VeloxColumnarToRowExec +(66) VeloxColumnarToRowExec Input [3]: [nation#X, o_year#X, sum_profit#X] -(65) Scan parquet +(67) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringContains(p_name,green), IsNotNull(p_partkey)] ReadSchema: struct -(66) Filter +(68) Filter Input [2]: [p_partkey#X, p_name#X] Condition : ((isnotnull(p_name#X) AND Contains(p_name#X, green)) AND isnotnull(p_partkey#X)) -(67) Project +(69) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(68) BroadcastExchange +(70) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(69) Scan parquet +(71) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(71) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(72) Project +(74) Project Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(73) Scan parquet +(75) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(74) Filter +(76) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(75) BroadcastExchange +(77) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(76) BroadcastHashJoin +(78) BroadcastHashJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(77) Project +(79) Project Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(78) Scan parquet +(80) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(79) Filter +(81) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Condition : (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(80) BroadcastExchange +(82) BroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] 
Arguments: HashedRelationBroadcastMode(List(input[1, bigint, false], input[0, bigint, false]),false), [plan_id=X] -(81) BroadcastHashJoin +(83) BroadcastHashJoin Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join condition: None -(82) Project +(84) Project Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(83) Scan parquet +(85) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(84) Filter +(86) Filter Input [2]: [o_orderkey#X, o_orderdate#X] Condition : isnotnull(o_orderkey#X) -(85) BroadcastExchange +(87) BroadcastExchange Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(87) Project +(89) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(88) Scan parquet +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(89) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(90) BroadcastExchange +(92) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(91) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(92) Project +(94) Project Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) as decimal(27,4))) - promote_precision(cast(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(l_quantity#X)), DecimalType(25,4), true) as decimal(27,4)))), DecimalType(27,4), true) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(93) HashAggregate +(95) HashAggregate Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(94) Exchange +(96) Exchange Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(95) HashAggregate +(97) HashAggregate Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(96) Exchange +(98) Exchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 
1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) Sort +(99) Sort Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(98) AdaptiveSparkPlan +(100) AdaptiveSparkPlan Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/1.txt index 74f79bd3ee642..afea15af53d24 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/1.txt @@ -1,31 +1,31 @@ == Physical Plan == -AdaptiveSparkPlan (28) +AdaptiveSparkPlan (30) +- == Final Plan == - VeloxColumnarToRowExec (19) - +- ^ SortExecTransformer (17) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8), Statistics(X) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (21) + +- ^ SortExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9), Statistics(X) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ ProjectExecTransformer (5) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ NoopFilter (2) +- ^ Scan parquet (1) +- == Initial Plan == - Sort (27) - +- Exchange (26) - +- HashAggregate (25) - +- Exchange (24) - +- HashAggregate (23) - +- Project (22) - +- Filter (21) - +- Scan parquet (20) + Sort (29) + +- Exchange (28) + +- HashAggregate (27) + +- Exchange (26) + +- HashAggregate (25) + +- Project (24) + +- Filter (23) + +- Scan parquet (22) (1) Scan parquet @@ -58,97 +58,105 @@ Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] +Arguments: X + +(8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, 
count#X, sum#X, count#X, count#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] +Arguments: X + +(15) 
ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(17) SortExecTransformer +(19) SortExecTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(18) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(19) VeloxColumnarToRowExec +(21) VeloxColumnarToRowExec Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(20) Scan parquet +(22) Scan parquet Output [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), LessThanOrEqual(l_shipdate,1998-09-02)] ReadSchema: struct -(21) Filter +(23) Filter Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Condition : (isnotnull(l_shipdate#X) AND (l_shipdate#X <= 1998-09-02)) -(22) Project +(24) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] -(23) HashAggregate +(25) HashAggregate Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [partial_sum(l_quantity#X), partial_sum(l_extendedprice#X), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))), partial_sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6))), partial_avg(l_quantity#X), partial_avg(l_extendedprice#X), partial_avg(l_discount#X), partial_count(1)] Aggregate Attributes [15]: [sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, 
sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Results [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(24) Exchange +(26) Exchange Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(25) HashAggregate +(27) HashAggregate Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(26) Exchange +(28) Exchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, 
l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Sort +(29) Sort Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(28) AdaptiveSparkPlan +(30) AdaptiveSparkPlan Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/10.txt index 276588749734a..7a509e435d075 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/10.txt @@ -1,68 +1,65 @@ == Physical Plan == -AdaptiveSparkPlan (67) +AdaptiveSparkPlan (68) +- == Final Plan == - VeloxColumnarToRowExec (43) - +- TakeOrderedAndProjectExecTransformer (42) - +- ^ ProjectExecTransformer (40) - +- ^ RegularHashAggregateExecTransformer (39) - +- ^ InputIteratorTransformer (38) - +- ^ InputAdapter (37) - +- ^ ShuffleQueryStage (36), Statistics(X) - +- ColumnarExchange (35) + VeloxColumnarToRowExec (44) + +- TakeOrderedAndProjectExecTransformer (43) + +- ^ ProjectExecTransformer (41) + +- ^ RegularHashAggregateExecTransformer (40) + +- ^ InputIteratorTransformer (39) + +- ShuffleQueryStage (37), Statistics(X) + +- ColumnarExchange (36) + +- VeloxAppendBatches (35) +- ^ ProjectExecTransformer (33) +- ^ FlushableHashAggregateExecTransformer (32) +- ^ ProjectExecTransformer (31) - +- ^ BroadcastHashJoinExecTransformer Inner (30) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (30) :- ^ ProjectExecTransformer (22) - : +- ^ BroadcastHashJoinExecTransformer Inner (21) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (21) : :- ^ ProjectExecTransformer (12) - : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) : : :- ^ NoopFilter (2) : : : +- ^ Scan parquet (1) : : +- ^ InputIteratorTransformer (10) - : : +- ^ InputAdapter (9) - : : +- ^ BroadcastQueryStage (8), Statistics(X) - : : +- ColumnarBroadcastExchange (7) - : : +- ^ ProjectExecTransformer (5) - : : +- ^ NoopFilter (4) - : : +- ^ Scan parquet (3) + : : +- BroadcastQueryStage (8), Statistics(X) + : : +- ColumnarBroadcastExchange (7) + : : +- ^ ProjectExecTransformer (5) + : : +- ^ NoopFilter (4) + : : +- ^ Scan parquet (3) : +- ^ InputIteratorTransformer (20) - : +- ^ InputAdapter (19) - : +- ^ BroadcastQueryStage (18), Statistics(X) - : +- ColumnarBroadcastExchange (17) - : +- ^ ProjectExecTransformer (15) - : +- ^ NoopFilter (14) - : +- ^ Scan parquet (13) + : +- BroadcastQueryStage (18), Statistics(X) + : +- ColumnarBroadcastExchange (17) + : +- ^ ProjectExecTransformer (15) + : +- ^ NoopFilter (14) + : +- ^ Scan parquet (13) +- ^ InputIteratorTransformer (29) - +- ^ InputAdapter (28) - +- ^ BroadcastQueryStage (27), Statistics(X) - +- ColumnarBroadcastExchange (26) - +- ^ NoopFilter (24) - +- ^ Scan parquet (23) + +- BroadcastQueryStage (27), Statistics(X) + +- ColumnarBroadcastExchange (26) + +- ^ NoopFilter (24) + +- ^ Scan parquet (23) +- == Initial Plan == - TakeOrderedAndProject (66) - +- HashAggregate (65) - +- Exchange (64) - +- HashAggregate (63) - +- Project 
(62) - +- BroadcastHashJoin Inner BuildRight (61) - :- Project (57) - : +- BroadcastHashJoin Inner BuildRight (56) - : :- Project (51) - : : +- BroadcastHashJoin Inner BuildRight (50) - : : :- Filter (45) - : : : +- Scan parquet (44) - : : +- BroadcastExchange (49) - : : +- Project (48) - : : +- Filter (47) - : : +- Scan parquet (46) - : +- BroadcastExchange (55) - : +- Project (54) - : +- Filter (53) - : +- Scan parquet (52) - +- BroadcastExchange (60) - +- Filter (59) - +- Scan parquet (58) + TakeOrderedAndProject (67) + +- HashAggregate (66) + +- Exchange (65) + +- HashAggregate (64) + +- Project (63) + +- BroadcastHashJoin Inner BuildRight (62) + :- Project (58) + : +- BroadcastHashJoin Inner BuildRight (57) + : :- Project (52) + : : +- BroadcastHashJoin Inner BuildRight (51) + : : :- Filter (46) + : : : +- Scan parquet (45) + : : +- BroadcastExchange (50) + : : +- Project (49) + : : +- Filter (48) + : : +- Scan parquet (47) + : +- BroadcastExchange (56) + : +- Project (55) + : +- Filter (54) + : +- Scan parquet (53) + +- BroadcastExchange (61) + +- Filter (60) + +- Scan parquet (59) (1) Scan parquet @@ -213,155 +210,159 @@ Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: false -(35) ColumnarExchange +(35) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] +Arguments: X + +(36) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(37) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: X -(37) InputAdapter +(38) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(38) InputIteratorTransformer +(39) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(39) RegularHashAggregateExecTransformer +(40) RegularHashAggregateExecTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), 
DecimalType(13,2)))), DecimalType(26,4)))#X] -(40) ProjectExecTransformer +(41) ProjectExecTransformer Output [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Input [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(41) WholeStageCodegenTransformer (X) +(42) WholeStageCodegenTransformer (X) Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: false -(42) TakeOrderedAndProjectExecTransformer +(43) TakeOrderedAndProjectExecTransformer Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X], 0 -(43) VeloxColumnarToRowExec +(44) VeloxColumnarToRowExec Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(44) Scan parquet +(45) Scan parquet Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(45) Filter +(46) Filter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(46) Scan parquet +(47) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(47) Filter +(48) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(48) Project +(49) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(49) BroadcastExchange +(50) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(50) BroadcastHashJoin +(51) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(51) Project +(52) Project Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(52) Scan parquet +(53) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(53) Filter +(54) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Condition : 
((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(54) Project +(55) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(55) BroadcastExchange +(56) BroadcastExchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(56) BroadcastHashJoin +(57) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(57) Project +(58) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(58) Scan parquet +(59) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(59) Filter +(60) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(60) BroadcastExchange +(61) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(61) BroadcastHashJoin +(62) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(62) Project +(63) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(63) HashAggregate +(64) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(64) Exchange +(65) Exchange Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(65) HashAggregate +(66) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [8]: [c_custkey#X, c_name#X, 
sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(66) TakeOrderedAndProject +(67) TakeOrderedAndProject Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(67) AdaptiveSparkPlan +(68) AdaptiveSparkPlan Output [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/11.txt index 54f535dfbfb62..1c4e102aa0b98 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/11.txt @@ -1,59 +1,57 @@ == Physical Plan == -AdaptiveSparkPlan (58) +AdaptiveSparkPlan (60) +- == Final Plan == - VeloxColumnarToRowExec (38) - +- ^ SortExecTransformer (36) - +- ^ InputIteratorTransformer (35) - +- ^ InputAdapter (34) - +- ^ ShuffleQueryStage (33), Statistics(X) - +- ColumnarExchange (32) - +- ^ FilterExecTransformer (30) - +- ^ RegularHashAggregateExecTransformer (29) - +- ^ InputIteratorTransformer (28) - +- ^ InputAdapter (27) - +- ^ ShuffleQueryStage (26), Statistics(X) - +- ColumnarExchange (25) + VeloxColumnarToRowExec (40) + +- ^ SortExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35), Statistics(X) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ FilterExecTransformer (31) + +- ^ RegularHashAggregateExecTransformer (30) + +- ^ InputIteratorTransformer (29) + +- ShuffleQueryStage (27), Statistics(X) + +- ColumnarExchange (26) + +- VeloxAppendBatches (25) +- ^ ProjectExecTransformer (23) +- ^ FlushableHashAggregateExecTransformer (22) +- ^ ProjectExecTransformer (21) - +- ^ BroadcastHashJoinExecTransformer Inner (20) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) :- ^ ProjectExecTransformer (11) - : +- ^ BroadcastHashJoinExecTransformer Inner (10) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (10) : :- ^ NoopFilter (2) : : +- ^ Scan parquet (1) : +- ^ InputIteratorTransformer (9) - : +- ^ InputAdapter (8) - : +- ^ BroadcastQueryStage (7), Statistics(X) - : +- ColumnarBroadcastExchange (6) - : +- ^ NoopFilter (4) - : +- ^ Scan parquet (3) + : +- BroadcastQueryStage (7), Statistics(X) + : +- ColumnarBroadcastExchange (6) + : +- ^ NoopFilter (4) + : +- ^ Scan parquet (3) +- ^ InputIteratorTransformer (19) - +- ^ InputAdapter (18) - +- ^ BroadcastQueryStage (17), Statistics(X) - +- ColumnarBroadcastExchange (16) - +- ^ ProjectExecTransformer (14) - +- ^ NoopFilter (13) - +- ^ Scan parquet (12) + +- BroadcastQueryStage (17), Statistics(X) + +- ColumnarBroadcastExchange (16) + +- ^ ProjectExecTransformer (14) + +- ^ NoopFilter (13) + +- ^ Scan parquet (12) +- == Initial Plan == - Sort (57) - +- Exchange (56) - +- Filter (55) - +- HashAggregate (54) - +- Exchange (53) - +- HashAggregate (52) - +- Project (51) - +- BroadcastHashJoin Inner BuildRight (50) - :- Project (45) - : +- BroadcastHashJoin Inner BuildRight (44) - : :- Filter (40) 
- : : +- Scan parquet (39) - : +- BroadcastExchange (43) - : +- Filter (42) - : +- Scan parquet (41) - +- BroadcastExchange (49) - +- Project (48) - +- Filter (47) - +- Scan parquet (46) + Sort (59) + +- Exchange (58) + +- Filter (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- Project (53) + +- BroadcastHashJoin Inner BuildRight (52) + :- Project (47) + : +- BroadcastHashJoin Inner BuildRight (46) + : :- Filter (42) + : : +- Scan parquet (41) + : +- BroadcastExchange (45) + : +- Filter (44) + : +- Scan parquet (43) + +- BroadcastExchange (51) + +- Project (50) + +- Filter (49) + +- Scan parquet (48) (1) Scan parquet @@ -162,382 +160,392 @@ Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: false -(25) ColumnarExchange +(25) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] +Arguments: X + +(26) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(26) ShuffleQueryStage +(27) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: X -(27) InputAdapter +(28) InputAdapter Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(28) InputIteratorTransformer +(29) InputIteratorTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(29) RegularHashAggregateExecTransformer +(30) RegularHashAggregateExecTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X AS value#X] -(30) FilterExecTransformer +(31) FilterExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(31) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(32) ColumnarExchange +(33) VeloxAppendBatches +Input [2]: [ps_partkey#X, value#X] +Arguments: X + +(34) ColumnarExchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(33) ShuffleQueryStage +(35) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] Arguments: X -(34) InputAdapter +(36) InputAdapter Input [2]: [ps_partkey#X, value#X] -(35) InputIteratorTransformer +(37) InputIteratorTransformer Input [2]: [ps_partkey#X, value#X] -(36) SortExecTransformer +(38) SortExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(37) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(38) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [2]: [ps_partkey#X, value#X] -(39) Scan parquet +(41) Scan parquet Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(40) Filter +(42) Filter Input [4]: [ps_partkey#X, 
ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(41) Scan parquet +(43) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(42) Filter +(44) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(43) BroadcastExchange +(45) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(44) BroadcastHashJoin +(46) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(45) Project +(47) Project Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(46) Scan parquet +(48) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(47) Filter +(49) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(48) Project +(50) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(49) BroadcastExchange +(51) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(50) BroadcastHashJoin +(52) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(51) Project +(53) Project Output [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(52) HashAggregate +(54) HashAggregate Input [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(53) Exchange +(55) Exchange Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) HashAggregate +(56) HashAggregate Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X AS value#X] -(55) Filter +(57) Filter Input [2]: [ps_partkey#X, value#X] Condition : (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(56) Exchange +(58) Exchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(57) Sort +(59) Sort Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(58) AdaptiveSparkPlan +(60) AdaptiveSparkPlan Output [2]: [ps_partkey#X, value#X] Arguments: isFinalPlan=true ===== Subqueries ===== 
-Subquery:1 Hosting operator id = 30 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (99) +Subquery:1 Hosting operator id = 31 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (102) +- == Final Plan == - VeloxColumnarToRowExec (82) - +- ^ ProjectExecTransformer (80) - +- ^ RegularHashAggregateExecTransformer (79) - +- ^ InputIteratorTransformer (78) - +- ^ InputAdapter (77) - +- ^ ShuffleQueryStage (76), Statistics(X) - +- ColumnarExchange (75) - +- ^ FlushableHashAggregateExecTransformer (73) - +- ^ ProjectExecTransformer (72) - +- ^ BroadcastHashJoinExecTransformer Inner (71) - :- ^ ProjectExecTransformer (66) - : +- ^ BroadcastHashJoinExecTransformer Inner (65) - : :- ^ NoopFilter (60) - : : +- ^ Scan parquet (59) - : +- ^ InputIteratorTransformer (64) - : +- ^ InputAdapter (63) - : +- ^ BroadcastQueryStage (62), Statistics(X) - : +- ReusedExchange (61) - +- ^ InputIteratorTransformer (70) - +- ^ InputAdapter (69) - +- ^ BroadcastQueryStage (68), Statistics(X) - +- ReusedExchange (67) + VeloxColumnarToRowExec (85) + +- ^ ProjectExecTransformer (83) + +- ^ RegularHashAggregateExecTransformer (82) + +- ^ InputIteratorTransformer (81) + +- ShuffleQueryStage (79), Statistics(X) + +- ColumnarExchange (78) + +- VeloxAppendBatches (77) + +- ^ FlushableHashAggregateExecTransformer (75) + +- ^ ProjectExecTransformer (74) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (73) + :- ^ ProjectExecTransformer (68) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (67) + : :- ^ NoopFilter (62) + : : +- ^ Scan parquet (61) + : +- ^ InputIteratorTransformer (66) + : +- BroadcastQueryStage (64), Statistics(X) + : +- ReusedExchange (63) + +- ^ InputIteratorTransformer (72) + +- BroadcastQueryStage (70), Statistics(X) + +- ReusedExchange (69) +- == Initial Plan == - HashAggregate (98) - +- Exchange (97) - +- HashAggregate (96) - +- Project (95) - +- BroadcastHashJoin Inner BuildRight (94) - :- Project (89) - : +- BroadcastHashJoin Inner BuildRight (88) - : :- Filter (84) - : : +- Scan parquet (83) - : +- BroadcastExchange (87) - : +- Filter (86) - : +- Scan parquet (85) - +- BroadcastExchange (93) - +- Project (92) - +- Filter (91) - +- Scan parquet (90) - - -(59) Scan parquet + HashAggregate (101) + +- Exchange (100) + +- HashAggregate (99) + +- Project (98) + +- BroadcastHashJoin Inner BuildRight (97) + :- Project (92) + : +- BroadcastHashJoin Inner BuildRight (91) + : :- Filter (87) + : : +- Scan parquet (86) + : +- BroadcastExchange (90) + : +- Filter (89) + : +- Scan parquet (88) + +- BroadcastExchange (96) + +- Project (95) + +- Filter (94) + +- Scan parquet (93) + + +(61) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(60) NoopFilter +(62) NoopFilter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(61) ReusedExchange [Reuses operator id: 6] +(63) ReusedExchange [Reuses operator id: 6] Output [2]: [s_suppkey#X, s_nationkey#X] -(62) BroadcastQueryStage +(64) BroadcastQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(63) InputAdapter +(65) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(64) InputIteratorTransformer +(66) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(65) BroadcastHashJoinExecTransformer +(67) BroadcastHashJoinExecTransformer Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join 
condition: None -(66) ProjectExecTransformer +(68) ProjectExecTransformer Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(67) ReusedExchange [Reuses operator id: 16] +(69) ReusedExchange [Reuses operator id: 16] Output [1]: [n_nationkey#X] -(68) BroadcastQueryStage +(70) BroadcastQueryStage Output [1]: [n_nationkey#X] Arguments: X -(69) InputAdapter +(71) InputAdapter Input [1]: [n_nationkey#X] -(70) InputIteratorTransformer +(72) InputIteratorTransformer Input [1]: [n_nationkey#X] -(71) BroadcastHashJoinExecTransformer +(73) BroadcastHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(72) ProjectExecTransformer +(74) ProjectExecTransformer Output [3]: [ps_availqty#X, ps_supplycost#X, CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)) AS _pre_X#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(73) FlushableHashAggregateExecTransformer +(75) FlushableHashAggregateExecTransformer Input [3]: [ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(74) WholeStageCodegenTransformer (X) +(76) WholeStageCodegenTransformer (X) Input [2]: [sum#X, isEmpty#X] Arguments: false -(75) ColumnarExchange +(77) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(78) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(76) ShuffleQueryStage +(79) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(77) InputAdapter +(80) InputAdapter Input [2]: [sum#X, isEmpty#X] -(78) InputIteratorTransformer +(81) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(79) RegularHashAggregateExecTransformer +(82) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] -(80) ProjectExecTransformer +(83) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(cast(sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X as decimal(38,10))) * 0.0001000000), DecimalType(38,6)) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Input [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] -(81) WholeStageCodegenTransformer (X) +(84) WholeStageCodegenTransformer (X) Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: false -(82) VeloxColumnarToRowExec +(85) VeloxColumnarToRowExec Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(83) Scan parquet +(86) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(84) Filter +(87) Filter Input [3]: 
[ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(85) Scan parquet +(88) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(86) Filter +(89) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(87) BroadcastExchange +(90) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(88) BroadcastHashJoin +(91) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(89) Project +(92) Project Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(90) Scan parquet +(93) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(91) Filter +(94) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(92) Project +(95) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(93) BroadcastExchange +(96) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(94) BroadcastHashJoin +(97) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(95) Project +(98) Project Output [2]: [ps_availqty#X, ps_supplycost#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(96) HashAggregate +(99) HashAggregate Input [2]: [ps_availqty#X, ps_supplycost#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(97) Exchange +(100) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(98) HashAggregate +(101) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [1]: [CheckOverflow((promote_precision(cast(sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X as decimal(38,10))) * 0.0001000000), DecimalType(38,6)) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(99) AdaptiveSparkPlan +(102) AdaptiveSparkPlan Output [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/12.txt index 1f1fcd3fb577f..12d0f6f0ff955 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/12.txt @@ -1,44 +1,43 @@ == Physical Plan 
== -AdaptiveSparkPlan (42) +AdaptiveSparkPlan (44) +- == Final Plan == - VeloxColumnarToRowExec (28) - +- ^ SortExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ InputIteratorTransformer (19) - +- ^ InputAdapter (18) - +- ^ ShuffleQueryStage (17), Statistics(X) - +- ColumnarExchange (16) + VeloxColumnarToRowExec (30) + +- ^ SortExecTransformer (28) + +- ^ InputIteratorTransformer (27) + +- ShuffleQueryStage (25), Statistics(X) + +- ColumnarExchange (24) + +- VeloxAppendBatches (23) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ InputIteratorTransformer (20) + +- ShuffleQueryStage (18), Statistics(X) + +- ColumnarExchange (17) + +- VeloxAppendBatches (16) +- ^ ProjectExecTransformer (14) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ BroadcastQueryStage (5), Statistics(X) - : +- ColumnarBroadcastExchange (4) - : +- ^ NoopFilter (2) - : +- ^ Scan parquet (1) + : +- BroadcastQueryStage (5), Statistics(X) + : +- ColumnarBroadcastExchange (4) + : +- ^ NoopFilter (2) + : +- ^ Scan parquet (1) +- ^ ProjectExecTransformer (10) +- ^ NoopFilter (9) +- ^ Scan parquet (8) +- == Initial Plan == - Sort (41) - +- Exchange (40) - +- HashAggregate (39) - +- Exchange (38) - +- HashAggregate (37) - +- Project (36) - +- BroadcastHashJoin Inner BuildLeft (35) - :- BroadcastExchange (31) - : +- Filter (30) - : +- Scan parquet (29) - +- Project (34) - +- Filter (33) - +- Scan parquet (32) + Sort (43) + +- Exchange (42) + +- HashAggregate (41) + +- Exchange (40) + +- HashAggregate (39) + +- Project (38) + +- BroadcastHashJoin Inner BuildLeft (37) + :- BroadcastExchange (33) + : +- Filter (32) + : +- Scan parquet (31) + +- Project (36) + +- Filter (35) + +- Scan parquet (34) (1) Scan parquet @@ -109,121 +108,129 @@ Input [3]: [l_shipmode#X, sum#X, sum#X] Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: false -(16) ColumnarExchange +(16) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] +Arguments: X + +(17) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] -(17) ShuffleQueryStage +(18) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] Arguments: X -(18) InputAdapter +(19) InputAdapter Input [3]: [l_shipmode#X, sum#X, sum#X] -(19) InputIteratorTransformer +(20) InputIteratorTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] -(20) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) 
THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(22) ColumnarExchange +(23) VeloxAppendBatches +Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] +Arguments: X + +(24) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(25) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: X -(24) InputAdapter +(26) InputAdapter Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(25) InputIteratorTransformer +(27) InputIteratorTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(26) SortExecTransformer +(28) SortExecTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(27) WholeStageCodegenTransformer (X) +(29) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(28) VeloxColumnarToRowExec +(30) VeloxColumnarToRowExec Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(29) Scan parquet +(31) Scan parquet Output [2]: [o_orderkey#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(30) Filter +(32) Filter Input [2]: [o_orderkey#X, o_orderpriority#X] Condition : isnotnull(o_orderkey#X) -(31) BroadcastExchange +(33) BroadcastExchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(32) Scan parquet +(34) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(33) Filter +(35) Filter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Condition : ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(34) Project +(36) Project Output [2]: [l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(35) BroadcastHashJoin +(37) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(36) Project +(38) Project Output [2]: [o_orderpriority#X, l_shipmode#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(37) HashAggregate +(39) HashAggregate Input [2]: [o_orderpriority#X, l_shipmode#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), partial_sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] 
Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(38) Exchange +(40) Exchange Input [3]: [l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) HashAggregate +(41) HashAggregate Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(40) Exchange +(42) Exchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Sort +(43) Sort Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(42) AdaptiveSparkPlan +(44) AdaptiveSparkPlan Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/13.txt index e67819b828608..1dfb80d78050a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/13.txt @@ -1,53 +1,52 @@ == Physical Plan == -AdaptiveSparkPlan (52) +AdaptiveSparkPlan (55) +- == Final Plan == - VeloxColumnarToRowExec (36) - +- ^ SortExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31), Statistics(X) - +- ColumnarExchange (30) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (39) + +- ^ SortExecTransformer (37) + +- ^ InputIteratorTransformer (36) + +- ShuffleQueryStage (34), Statistics(X) + +- ColumnarExchange (33) + +- VeloxAppendBatches (32) + +- ^ RegularHashAggregateExecTransformer (30) + +- ^ InputIteratorTransformer (29) + +- ShuffleQueryStage (27), Statistics(X) + +- ColumnarExchange (26) + +- VeloxAppendBatches (25) + +- ^ ProjectExecTransformer (23) + +- ^ FlushableHashAggregateExecTransformer (22) + +- ^ ProjectExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ ProjectExecTransformer (13) +- ^ FlushableHashAggregateExecTransformer (12) +- ^ ProjectExecTransformer (11) - +- ^ 
BroadcastHashJoinExecTransformer LeftOuter (10) + +- ^ BroadcastHashJoinExecTransformer LeftOuter BuildRight (10) :- ^ Scan parquet (1) +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ BroadcastQueryStage (7), Statistics(X) - +- ColumnarBroadcastExchange (6) - +- ^ ProjectExecTransformer (4) - +- ^ NoopFilter (3) - +- ^ Scan parquet (2) + +- BroadcastQueryStage (7), Statistics(X) + +- ColumnarBroadcastExchange (6) + +- ^ ProjectExecTransformer (4) + +- ^ NoopFilter (3) + +- ^ Scan parquet (2) +- == Initial Plan == - Sort (51) - +- Exchange (50) - +- HashAggregate (49) - +- Exchange (48) - +- HashAggregate (47) - +- HashAggregate (46) - +- Exchange (45) - +- HashAggregate (44) - +- Project (43) - +- BroadcastHashJoin LeftOuter BuildRight (42) - :- Scan parquet (37) - +- BroadcastExchange (41) - +- Project (40) - +- Filter (39) - +- Scan parquet (38) + Sort (54) + +- Exchange (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- HashAggregate (49) + +- Exchange (48) + +- HashAggregate (47) + +- Project (46) + +- BroadcastHashJoin LeftOuter BuildRight (45) + :- Scan parquet (40) + +- BroadcastExchange (44) + +- Project (43) + +- Filter (42) + +- Scan parquet (41) (1) Scan parquet @@ -113,174 +112,186 @@ Input [2]: [c_custkey#X, count#X] Input [3]: [hash_partition_key#X, c_custkey#X, count#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, count#X] +Arguments: X + +(16) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, count#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [2]: [c_custkey#X, count#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [2]: [c_custkey#X, count#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [2]: [c_custkey#X, count#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [2]: [c_custkey#X, count(o_orderkey#X)#X] -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [count(o_orderkey#X)#X AS c_count#X] Input [2]: [c_custkey#X, count(o_orderkey#X)#X] -(21) FlushableHashAggregateExecTransformer +(22) FlushableHashAggregateExecTransformer Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(22) ProjectExecTransformer +(23) ProjectExecTransformer Output [3]: [hash(c_count#X, 42) AS hash_partition_key#X, c_count#X, count#X] Input [2]: [c_count#X, count#X] -(23) WholeStageCodegenTransformer (X) +(24) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: false -(24) ColumnarExchange +(25) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_count#X, count#X] +Arguments: X + +(26) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(27) ShuffleQueryStage Output [2]: [c_count#X, count#X] Arguments: X -(26) InputAdapter +(28) InputAdapter Input [2]: [c_count#X, count#X] -(27) InputIteratorTransformer +(29) InputIteratorTransformer Input [2]: [c_count#X, count#X] -(28) RegularHashAggregateExecTransformer +(30) 
RegularHashAggregateExecTransformer Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(29) WholeStageCodegenTransformer (X) +(31) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(30) ColumnarExchange +(32) VeloxAppendBatches +Input [2]: [c_count#X, custdist#X] +Arguments: X + +(33) ColumnarExchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(34) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] Arguments: X -(32) InputAdapter +(35) InputAdapter Input [2]: [c_count#X, custdist#X] -(33) InputIteratorTransformer +(36) InputIteratorTransformer Input [2]: [c_count#X, custdist#X] -(34) SortExecTransformer +(37) SortExecTransformer Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(35) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(36) VeloxColumnarToRowExec +(39) VeloxColumnarToRowExec Input [2]: [c_count#X, custdist#X] -(37) Scan parquet +(40) Scan parquet Output [1]: [c_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Scan parquet +(41) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(39) Filter +(42) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Condition : ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(40) Project +(43) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(41) BroadcastExchange +(44) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(42) BroadcastHashJoin +(45) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(43) Project +(46) Project Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(44) HashAggregate +(47) HashAggregate Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(45) Exchange +(48) Exchange Input [2]: [c_custkey#X, count#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(46) HashAggregate +(49) HashAggregate Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [1]: [count(o_orderkey#X)#X AS c_count#X] -(47) HashAggregate +(50) HashAggregate Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(48) Exchange +(51) Exchange Input [2]: [c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) HashAggregate +(52) HashAggregate Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(50) Exchange +(53) Exchange Input [2]: [c_count#X, custdist#X] 
Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Sort +(54) Sort Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(52) AdaptiveSparkPlan +(55) AdaptiveSparkPlan Output [2]: [c_count#X, custdist#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/14.txt index eb04b6c18271e..6f71859cba6d2 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/14.txt @@ -1,37 +1,36 @@ == Physical Plan == -AdaptiveSparkPlan (34) +AdaptiveSparkPlan (35) +- == Final Plan == - VeloxColumnarToRowExec (22) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (23) + +- ^ ProjectExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) :- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ BroadcastQueryStage (8), Statistics(X) - +- ColumnarBroadcastExchange (7) - +- ^ NoopFilter (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (8), Statistics(X) + +- ColumnarBroadcastExchange (7) + +- ^ NoopFilter (5) + +- ^ Scan parquet (4) +- == Initial Plan == - HashAggregate (33) - +- Exchange (32) - +- HashAggregate (31) - +- Project (30) - +- BroadcastHashJoin Inner BuildRight (29) - :- Project (25) - : +- Filter (24) - : +- Scan parquet (23) - +- BroadcastExchange (28) - +- Filter (27) - +- Scan parquet (26) + HashAggregate (34) + +- Exchange (33) + +- HashAggregate (32) + +- Project (31) + +- BroadcastHashJoin Inner BuildRight (30) + :- Project (26) + : +- Filter (25) + : +- Scan parquet (24) + +- BroadcastExchange (29) + +- Filter (28) + +- Scan parquet (27) (1) Scan parquet @@ -98,95 +97,99 @@ Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(16) ColumnarExchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - 
promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X)), DecimalType(38,6))) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X as decimal(38,6)))), DecimalType(38,6)) AS promo_revenue#X] Input [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [1]: [promo_revenue#X] Arguments: false -(22) VeloxColumnarToRowExec +(23) VeloxColumnarToRowExec Input [1]: [promo_revenue#X] -(23) Scan parquet +(24) Scan parquet Output [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-09-01), LessThan(l_shipdate,1995-10-01), IsNotNull(l_partkey)] ReadSchema: struct -(24) Filter +(25) Filter Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-09-01)) AND (l_shipdate#X < 1995-10-01)) AND isnotnull(l_partkey#X)) -(25) Project +(26) Project Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_partkey#X, l_extendedprice#X, 
l_discount#X, l_shipdate#X] -(26) Scan parquet +(27) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(27) Filter +(28) Filter Input [2]: [p_partkey#X, p_type#X] Condition : isnotnull(p_partkey#X) -(28) BroadcastExchange +(29) BroadcastExchange Input [2]: [p_partkey#X, p_type#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(29) BroadcastHashJoin +(30) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(30) Project +(31) Project Output [3]: [l_extendedprice#X, l_discount#X, p_type#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(31) HashAggregate +(32) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, p_type#X] Keys: [] Functions [2]: [partial_sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(32) Exchange +(33) Exchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(33) HashAggregate +(34) HashAggregate Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X)), DecimalType(38,6))) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X as decimal(38,6)))), DecimalType(38,6)) AS promo_revenue#X] -(34) AdaptiveSparkPlan +(35) AdaptiveSparkPlan 
Output [1]: [promo_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/15.txt index 5c6304211102f..40b342510de2b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/15.txt @@ -1,44 +1,44 @@ == Physical Plan == -AdaptiveSparkPlan (41) +AdaptiveSparkPlan (43) +- == Final Plan == - VeloxColumnarToRowExec (26) - +- AQEShuffleRead (25) - +- ShuffleQueryStage (24), Statistics(X) - +- ColumnarExchange (23) - +- ^ ProjectExecTransformer (21) - +- ^ BroadcastHashJoinExecTransformer Inner (20) - :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ BroadcastQueryStage (5), Statistics(X) - : +- ColumnarBroadcastExchange (4) - : +- ^ NoopFilter (2) - : +- ^ Scan parquet (1) - +- ^ FilterExecTransformer (19) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ InputIteratorTransformer (17) - +- ^ InputAdapter (16) - +- ^ ShuffleQueryStage (15), Statistics(X) - +- ColumnarExchange (14) - +- ^ ProjectExecTransformer (12) - +- ^ FlushableHashAggregateExecTransformer (11) - +- ^ ProjectExecTransformer (10) - +- ^ NoopFilter (9) - +- ^ Scan parquet (8) + VeloxColumnarToRowExec (28) + +- AQEShuffleRead (27) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (21) + :- ^ InputIteratorTransformer (7) + : +- BroadcastQueryStage (5), Statistics(X) + : +- ColumnarBroadcastExchange (4) + : +- ^ NoopFilter (2) + : +- ^ Scan parquet (1) + +- ^ FilterExecTransformer (20) + +- ^ RegularHashAggregateExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FlushableHashAggregateExecTransformer (11) + +- ^ ProjectExecTransformer (10) + +- ^ NoopFilter (9) + +- ^ Scan parquet (8) +- == Initial Plan == - Sort (40) - +- Exchange (39) - +- Project (38) - +- BroadcastHashJoin Inner BuildLeft (37) - :- BroadcastExchange (29) - : +- Filter (28) - : +- Scan parquet (27) - +- Filter (36) - +- HashAggregate (35) - +- Exchange (34) - +- HashAggregate (33) - +- Project (32) - +- Filter (31) - +- Scan parquet (30) + Sort (42) + +- Exchange (41) + +- Project (40) + +- BroadcastHashJoin Inner BuildLeft (39) + :- BroadcastExchange (31) + : +- Filter (30) + : +- Scan parquet (29) + +- Filter (38) + +- HashAggregate (37) + +- Exchange (36) + +- HashAggregate (35) + +- Project (34) + +- Filter (33) + +- Scan parquet (32) (1) Scan parquet @@ -100,285 +100,297 @@ Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) 
InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] -(19) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [supplier_no#X, total_revenue#X] Arguments: (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(20) BroadcastHashJoinExecTransformer +(21) BroadcastHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(21) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: X -(25) AQEShuffleRead +(27) AQEShuffleRead Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: local -(26) VeloxColumnarToRowExec +(28) VeloxColumnarToRowExec Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(27) Scan parquet +(29) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey)] ReadSchema: struct -(28) Filter +(30) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Condition : isnotnull(s_suppkey#X) -(29) BroadcastExchange +(31) BroadcastExchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(30) Scan parquet +(32) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(31) Filter +(33) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) 
-(32) Project +(34) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(33) HashAggregate +(35) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(34) Exchange +(36) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(35) HashAggregate +(37) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] -(36) Filter +(38) Filter Input [2]: [supplier_no#X, total_revenue#X] Condition : (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(37) BroadcastHashJoin +(39) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(38) Project +(40) Project Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(39) Exchange +(41) Exchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(40) Sort +(42) Sort Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(41) AdaptiveSparkPlan +(43) AdaptiveSparkPlan Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 19 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (66) +Subquery:1 Hosting operator id = 20 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (69) +- == Final Plan == - VeloxColumnarToRowExec (57) - +- ^ RegularHashAggregateExecTransformer (55) - +- ^ RegularHashAggregateExecTransformer (54) - +- ^ ProjectExecTransformer (53) - +- ^ RegularHashAggregateExecTransformer (52) - +- ^ InputIteratorTransformer (51) - +- ^ InputAdapter (50) - +- ^ ShuffleQueryStage (49), Statistics(X) - +- ColumnarExchange (48) - +- ^ ProjectExecTransformer (46) - +- ^ FlushableHashAggregateExecTransformer (45) - +- ^ ProjectExecTransformer (44) - +- ^ NoopFilter (43) - +- ^ Scan parquet (42) + VeloxColumnarToRowExec (60) + +- ^ RegularHashAggregateExecTransformer (58) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ 
ProjectExecTransformer (56) + +- ^ RegularHashAggregateExecTransformer (55) + +- ^ InputIteratorTransformer (54) + +- ShuffleQueryStage (52), Statistics(X) + +- ColumnarExchange (51) + +- VeloxAppendBatches (50) + +- ^ ProjectExecTransformer (48) + +- ^ FlushableHashAggregateExecTransformer (47) + +- ^ ProjectExecTransformer (46) + +- ^ NoopFilter (45) + +- ^ Scan parquet (44) +- == Initial Plan == - HashAggregate (65) - +- HashAggregate (64) - +- HashAggregate (63) - +- Exchange (62) - +- HashAggregate (61) - +- Project (60) - +- Filter (59) - +- Scan parquet (58) + HashAggregate (68) + +- HashAggregate (67) + +- HashAggregate (66) + +- Exchange (65) + +- HashAggregate (64) + +- Project (63) + +- Filter (62) + +- Scan parquet (61) -(42) Scan parquet +(44) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(43) NoopFilter +(45) NoopFilter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(44) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(45) FlushableHashAggregateExecTransformer +(47) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(46) ProjectExecTransformer +(48) ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(47) WholeStageCodegenTransformer (X) +(49) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(48) ColumnarExchange +(50) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(51) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(49) ShuffleQueryStage +(52) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(50) InputAdapter +(53) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(51) InputIteratorTransformer +(54) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(52) RegularHashAggregateExecTransformer +(55) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), 
DecimalType(26,4)))#X] Results [2]: [l_suppkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(53) ProjectExecTransformer +(56) ProjectExecTransformer Output [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] Input [2]: [l_suppkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(54) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(55) RegularHashAggregateExecTransformer +(58) RegularHashAggregateExecTransformer Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(56) WholeStageCodegenTransformer (X) +(59) WholeStageCodegenTransformer (X) Input [1]: [max(total_revenue)#X] Arguments: false -(57) VeloxColumnarToRowExec +(60) VeloxColumnarToRowExec Input [1]: [max(total_revenue)#X] -(58) Scan parquet +(61) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(59) Filter +(62) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) -(60) Project +(63) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(61) HashAggregate +(64) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(62) Exchange +(65) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(63) HashAggregate +(66) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - 
promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] -(64) HashAggregate +(67) HashAggregate Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(65) HashAggregate +(68) HashAggregate Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(66) AdaptiveSparkPlan +(69) AdaptiveSparkPlan Output [1]: [max(total_revenue)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/16.txt index 9a4005e3da858..d579c1a704e5e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/16.txt @@ -1,57 +1,56 @@ == Physical Plan == -AdaptiveSparkPlan (56) +AdaptiveSparkPlan (59) +- == Final Plan == - VeloxColumnarToRowExec (35) - +- ^ SortExecTransformer (33) - +- ^ InputIteratorTransformer (32) - +- ^ InputAdapter (31) - +- ^ ShuffleQueryStage (30), Statistics(X) - +- ColumnarExchange (29) - +- ^ RegularHashAggregateExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24), Statistics(X) - +- ColumnarExchange (23) - +- ^ ProjectExecTransformer (21) - +- ^ FlushableHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ ProjectExecTransformer (13) +- ^ FlushableHashAggregateExecTransformer (12) +- ^ ProjectExecTransformer (11) - +- ^ BroadcastHashJoinExecTransformer Inner (10) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (10) :- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ BroadcastQueryStage (7), Statistics(X) - +- ColumnarBroadcastExchange (6) - +- ^ NoopFilter (4) - +- ^ Scan parquet (3) + +- BroadcastQueryStage (7), Statistics(X) + +- ColumnarBroadcastExchange (6) + +- ^ NoopFilter (4) + +- ^ Scan parquet (3) +- == Initial Plan == - Sort (55) - +- Exchange (54) - +- HashAggregate (53) - +- Exchange (52) - +- HashAggregate (51) - +- HashAggregate (50) - +- Exchange (49) - +- HashAggregate (48) - +- Project (47) - +- BroadcastHashJoin Inner BuildRight (46) - :- BroadcastHashJoin LeftAnti BuildRight (42) - : :- Filter (37) - : : +- Scan parquet (36) - : +- BroadcastExchange (41) - : +- Project (40) - : +- Filter (39) - : +- Scan parquet (38) - +- BroadcastExchange (45) - +- Filter (44) - +- Scan parquet (43) + Sort (58) + +- 
Exchange (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- HashAggregate (53) + +- Exchange (52) + +- HashAggregate (51) + +- Project (50) + +- BroadcastHashJoin Inner BuildRight (49) + :- BroadcastHashJoin LeftAnti BuildRight (45) + : :- Filter (40) + : : +- Scan parquet (39) + : +- BroadcastExchange (44) + : +- Project (43) + : +- Filter (42) + : +- Scan parquet (41) + +- BroadcastExchange (48) + +- Filter (47) + +- Scan parquet (46) (1) Scan parquet @@ -118,195 +117,207 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] +Arguments: X + +(16) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(20) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(21) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: X -(25) InputAdapter +(27) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(27) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(28) WholeStageCodegenTransformer (X) +(30) 
WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(29) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(30) ShuffleQueryStage +(33) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: X -(31) InputAdapter +(34) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(32) InputIteratorTransformer +(35) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(33) SortExecTransformer +(36) SortExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(34) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(35) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(36) Scan parquet +(39) Scan parquet Output [2]: [ps_partkey#X, ps_suppkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_partkey)] ReadSchema: struct -(37) Filter +(40) Filter Input [2]: [ps_partkey#X, ps_suppkey#X] Condition : isnotnull(ps_partkey#X) -(38) Scan parquet +(41) Scan parquet Output [2]: [s_suppkey#X, s_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_comment)] ReadSchema: struct -(39) Filter +(42) Filter Input [2]: [s_suppkey#X, s_comment#X] Condition : (isnotnull(s_comment#X) AND s_comment#X LIKE %Customer%Complaints%) -(40) Project +(43) Project Output [1]: [s_suppkey#X] Input [2]: [s_suppkey#X, s_comment#X] -(41) BroadcastExchange +(44) BroadcastExchange Input [1]: [s_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),true), [plan_id=X] -(42) BroadcastHashJoin +(45) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(43) Scan parquet +(46) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(44) Filter +(47) Filter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Condition : (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(45) BroadcastExchange +(48) BroadcastExchange Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(46) BroadcastHashJoin +(49) BroadcastHashJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(47) Project +(50) Project Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(48) HashAggregate +(51) HashAggregate Input [4]: [ps_suppkey#X, 
p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(49) Exchange +(52) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(50) HashAggregate +(53) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(51) HashAggregate +(54) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(52) Exchange +(55) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(53) HashAggregate +(56) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(54) Exchange +(57) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(55) Sort +(58) Sort Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(56) AdaptiveSparkPlan +(59) AdaptiveSparkPlan Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/17.txt index 712560e7f577c..1e2ed970aef4f 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/17.txt @@ -1,39 +1,38 @@ == Physical Plan == -AdaptiveSparkPlan (35) +AdaptiveSparkPlan (36) +- == Final Plan == - VeloxColumnarToRowExec (14) - +- ^ ProjectExecTransformer (12) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8), Statistics(X) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (15) + +- ^ ProjectExecTransformer (13) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9), Statistics(X) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ FlushableHashAggregateExecTransformer (5) +- ^ InputIteratorTransformer (4) - +- ^ InputAdapter (3) - +- ^ RowToVeloxColumnar (2) - +- ^ LocalTableScan (1) + +- RowToVeloxColumnar (2) + +- LocalTableScan (1) +- == Initial Plan == - HashAggregate (34) - +- Exchange (33) - +- HashAggregate (32) - +- Project (31) - +- BroadcastHashJoin Inner BuildRight (30) - :- Project (22) - : +- BroadcastHashJoin Inner BuildRight (21) - : :- Filter (16) - : : +- Scan parquet (15) - : +- BroadcastExchange (20) - : 
+- Project (19) - : +- Filter (18) - : +- Scan parquet (17) - +- BroadcastExchange (29) - +- Filter (28) - +- HashAggregate (27) - +- Exchange (26) - +- HashAggregate (25) - +- Filter (24) - +- Scan parquet (23) + HashAggregate (35) + +- Exchange (34) + +- HashAggregate (33) + +- Project (32) + +- BroadcastHashJoin Inner BuildRight (31) + :- Project (23) + : +- BroadcastHashJoin Inner BuildRight (22) + : :- Filter (17) + : : +- Scan parquet (16) + : +- BroadcastExchange (21) + : +- Project (20) + : +- Filter (19) + : +- Scan parquet (18) + +- BroadcastExchange (30) + +- Filter (29) + +- HashAggregate (28) + +- Exchange (27) + +- HashAggregate (26) + +- Filter (25) + +- Scan parquet (24) (1) LocalTableScan @@ -60,141 +59,145 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(8) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [2]: [sum#X, isEmpty#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [sum(l_extendedprice#X)#X] -(12) ProjectExecTransformer +(13) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6)) AS avg_yearly#X] Input [1]: [sum(l_extendedprice#X)#X] -(13) WholeStageCodegenTransformer (X) +(14) WholeStageCodegenTransformer (X) Input [1]: [avg_yearly#X] Arguments: false -(14) VeloxColumnarToRowExec +(15) VeloxColumnarToRowExec Input [1]: [avg_yearly#X] -(15) Scan parquet +(16) Scan parquet Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_quantity)] ReadSchema: struct -(16) Filter +(17) Filter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Condition : (isnotnull(l_partkey#X) AND isnotnull(l_quantity#X)) -(17) Scan parquet +(18) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(18) Filter +(19) Filter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Condition : ((((isnotnull(p_brand#X) AND isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(19) Project +(20) Project Output [1]: [p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(20) BroadcastExchange +(21) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(21) BroadcastHashJoin +(22) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(22) Project +(23) Project Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(23) Scan parquet +(24) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: 
[IsNotNull(l_partkey)] ReadSchema: struct -(24) Filter +(25) Filter Input [2]: [l_partkey#X, l_quantity#X] Condition : isnotnull(l_partkey#X) -(25) HashAggregate +(26) HashAggregate Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(26) Exchange +(27) Exchange Input [3]: [l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) HashAggregate +(28) HashAggregate Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [CheckOverflow((0.200000 * promote_precision(avg(l_quantity#X)#X)), DecimalType(18,7)) AS (0.2 * avg(l_quantity))#X, l_partkey#X] -(28) Filter +(29) Filter Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Condition : isnotnull((0.2 * avg(l_quantity))#X) -(29) BroadcastExchange +(30) BroadcastExchange Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(30) BroadcastHashJoin +(31) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(31) Project +(32) Project Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(32) HashAggregate +(33) HashAggregate Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(33) Exchange +(34) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(34) HashAggregate +(35) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6)) AS avg_yearly#X] -(35) AdaptiveSparkPlan +(36) AdaptiveSparkPlan Output [1]: [avg_yearly#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/18.txt index 5a941687c5258..2e7ce455aebd0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/18.txt @@ -1,86 +1,82 @@ == Physical Plan == -AdaptiveSparkPlan (86) +AdaptiveSparkPlan (88) +- == Final Plan == - VeloxColumnarToRowExec (53) - +- TakeOrderedAndProjectExecTransformer (52) - +- ^ RegularHashAggregateExecTransformer (50) - +- ^ InputIteratorTransformer (49) - +- ^ InputAdapter (48) - +- ^ ShuffleQueryStage (47), Statistics(X) - +- ColumnarExchange (46) - +- ^ ProjectExecTransformer (44) - +- ^ FlushableHashAggregateExecTransformer (43) - +- ^ ProjectExecTransformer (42) - +- ^ BroadcastHashJoinExecTransformer Inner (41) - :- ^ ProjectExecTransformer (28) - : +- ^ BroadcastHashJoinExecTransformer Inner (27) + VeloxColumnarToRowExec (55) + +- TakeOrderedAndProjectExecTransformer (54) + +- ^ RegularHashAggregateExecTransformer (52) + +- ^ InputIteratorTransformer (51) + +- ShuffleQueryStage (49), Statistics(X) + +- ColumnarExchange (48) + +- VeloxAppendBatches (47) + +- ^ 
ProjectExecTransformer (45) + +- ^ FlushableHashAggregateExecTransformer (44) + +- ^ ProjectExecTransformer (43) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (42) + :- ^ ProjectExecTransformer (29) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (28) : :- ^ InputIteratorTransformer (7) - : : +- ^ InputAdapter (6) - : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : +- ColumnarBroadcastExchange (4) - : : +- ^ NoopFilter (2) - : : +- ^ Scan parquet (1) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (26) + : : +- BroadcastQueryStage (5), Statistics(X) + : : +- ColumnarBroadcastExchange (4) + : : +- ^ NoopFilter (2) + : : +- ^ Scan parquet (1) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (27) : :- ^ NoopFilter (9) : : +- ^ Scan parquet (8) - : +- ^ InputIteratorTransformer (25) - : +- ^ InputAdapter (24) - : +- ^ BroadcastQueryStage (23), Statistics(X) - : +- ColumnarBroadcastExchange (22) - : +- ^ ProjectExecTransformer (20) - : +- ^ FilterExecTransformer (19) - : +- ^ RegularHashAggregateExecTransformer (18) - : +- ^ InputIteratorTransformer (17) - : +- ^ InputAdapter (16) - : +- ^ ShuffleQueryStage (15), Statistics(X) - : +- ColumnarExchange (14) - : +- ^ ProjectExecTransformer (12) - : +- ^ FlushableHashAggregateExecTransformer (11) - : +- ^ Scan parquet (10) - +- ^ InputIteratorTransformer (40) - +- ^ InputAdapter (39) - +- ^ BroadcastQueryStage (38), Statistics(X) - +- ColumnarBroadcastExchange (37) - +- ^ BroadcastHashJoinExecTransformer LeftSemi (35) - :- ^ NoopFilter (30) - : +- ^ Scan parquet (29) - +- ^ InputIteratorTransformer (34) - +- ^ InputAdapter (33) - +- ^ BroadcastQueryStage (32), Statistics(X) - +- ReusedExchange (31) + : +- ^ InputIteratorTransformer (26) + : +- BroadcastQueryStage (24), Statistics(X) + : +- ColumnarBroadcastExchange (23) + : +- ^ ProjectExecTransformer (21) + : +- ^ FilterExecTransformer (20) + : +- ^ RegularHashAggregateExecTransformer (19) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FlushableHashAggregateExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (41) + +- BroadcastQueryStage (39), Statistics(X) + +- ColumnarBroadcastExchange (38) + +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (36) + :- ^ NoopFilter (31) + : +- ^ Scan parquet (30) + +- ^ InputIteratorTransformer (35) + +- BroadcastQueryStage (33), Statistics(X) + +- ReusedExchange (32) +- == Initial Plan == - TakeOrderedAndProject (85) - +- HashAggregate (84) - +- Exchange (83) - +- HashAggregate (82) - +- Project (81) - +- BroadcastHashJoin Inner BuildRight (80) - :- Project (68) - : +- BroadcastHashJoin Inner BuildLeft (67) - : :- BroadcastExchange (56) - : : +- Filter (55) - : : +- Scan parquet (54) - : +- BroadcastHashJoin LeftSemi BuildRight (66) - : :- Filter (58) - : : +- Scan parquet (57) - : +- BroadcastExchange (65) - : +- Project (64) - : +- Filter (63) - : +- HashAggregate (62) - : +- Exchange (61) - : +- HashAggregate (60) - : +- Scan parquet (59) - +- BroadcastExchange (79) - +- BroadcastHashJoin LeftSemi BuildRight (78) - :- Filter (70) - : +- Scan parquet (69) - +- BroadcastExchange (77) - +- Project (76) - +- Filter (75) - +- HashAggregate (74) - +- Exchange (73) - +- HashAggregate (72) - +- Scan parquet (71) + TakeOrderedAndProject (87) + +- HashAggregate (86) + +- Exchange (85) + +- HashAggregate (84) + +- Project (83) + +- BroadcastHashJoin 
Inner BuildRight (82) + :- Project (70) + : +- BroadcastHashJoin Inner BuildLeft (69) + : :- BroadcastExchange (58) + : : +- Filter (57) + : : +- Scan parquet (56) + : +- BroadcastHashJoin LeftSemi BuildRight (68) + : :- Filter (60) + : : +- Scan parquet (59) + : +- BroadcastExchange (67) + : +- Project (66) + : +- Filter (65) + : +- HashAggregate (64) + : +- Exchange (63) + : +- HashAggregate (62) + : +- Scan parquet (61) + +- BroadcastExchange (81) + +- BroadcastHashJoin LeftSemi BuildRight (80) + :- Filter (72) + : +- Scan parquet (71) + +- BroadcastExchange (79) + +- Project (78) + +- Filter (77) + +- HashAggregate (76) + +- Exchange (75) + +- HashAggregate (74) + +- Scan parquet (73) (1) Scan parquet @@ -144,333 +140,341 @@ Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(19) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [1]: [l_orderkey#X] Arguments: false -(22) ColumnarBroadcastExchange +(23) ColumnarBroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(23) BroadcastQueryStage +(24) BroadcastQueryStage Output [1]: [l_orderkey#X] Arguments: X -(24) InputAdapter +(25) InputAdapter Input [1]: [l_orderkey#X] -(25) InputIteratorTransformer +(26) InputIteratorTransformer Input [1]: [l_orderkey#X] -(26) BroadcastHashJoinExecTransformer +(27) BroadcastHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(27) BroadcastHashJoinExecTransformer +(28) BroadcastHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(28) ProjectExecTransformer +(29) ProjectExecTransformer Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(29) Scan parquet +(30) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(30) NoopFilter +(31) NoopFilter Input [2]: [l_orderkey#X, l_quantity#X] Arguments: [l_orderkey#X, l_quantity#X] -(31) ReusedExchange [Reuses operator 
id: 22] +(32) ReusedExchange [Reuses operator id: 23] Output [1]: [l_orderkey#X] -(32) BroadcastQueryStage +(33) BroadcastQueryStage Output [1]: [l_orderkey#X] Arguments: X -(33) InputAdapter +(34) InputAdapter Input [1]: [l_orderkey#X] -(34) InputIteratorTransformer +(35) InputIteratorTransformer Input [1]: [l_orderkey#X] -(35) BroadcastHashJoinExecTransformer +(36) BroadcastHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(36) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [2]: [l_orderkey#X, l_quantity#X] Arguments: false -(37) ColumnarBroadcastExchange +(38) ColumnarBroadcastExchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(38) BroadcastQueryStage +(39) BroadcastQueryStage Output [2]: [l_orderkey#X, l_quantity#X] Arguments: X -(39) InputAdapter +(40) InputAdapter Input [2]: [l_orderkey#X, l_quantity#X] -(40) InputIteratorTransformer +(41) InputIteratorTransformer Input [2]: [l_orderkey#X, l_quantity#X] -(41) BroadcastHashJoinExecTransformer +(42) BroadcastHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(42) ProjectExecTransformer +(43) ProjectExecTransformer Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(43) FlushableHashAggregateExecTransformer +(44) FlushableHashAggregateExecTransformer Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(44) ProjectExecTransformer +(45) ProjectExecTransformer Output [8]: [hash(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 42) AS hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(45) WholeStageCodegenTransformer (X) +(46) WholeStageCodegenTransformer (X) Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: false -(46) ColumnarExchange +(47) VeloxAppendBatches +Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] +Arguments: X + +(48) ColumnarExchange Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(47) ShuffleQueryStage +(49) ShuffleQueryStage Output [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: X -(48) InputAdapter +(50) InputAdapter Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(49) InputIteratorTransformer +(51) InputIteratorTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] 
-(50) RegularHashAggregateExecTransformer +(52) RegularHashAggregateExecTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(51) WholeStageCodegenTransformer (X) +(53) WholeStageCodegenTransformer (X) Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: false -(52) TakeOrderedAndProjectExecTransformer +(54) TakeOrderedAndProjectExecTransformer Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X], 0 -(53) VeloxColumnarToRowExec +(55) VeloxColumnarToRowExec Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(54) Scan parquet +(56) Scan parquet Output [2]: [c_custkey#X, c_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey)] ReadSchema: struct -(55) Filter +(57) Filter Input [2]: [c_custkey#X, c_name#X] Condition : isnotnull(c_custkey#X) -(56) BroadcastExchange +(58) BroadcastExchange Input [2]: [c_custkey#X, c_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(57) Scan parquet +(59) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(58) Filter +(60) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Condition : (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(59) Scan parquet +(61) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(60) HashAggregate +(62) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(61) Exchange +(63) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(62) HashAggregate +(64) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(63) Filter +(65) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(64) Project +(66) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(65) BroadcastExchange +(67) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(66) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(67) BroadcastHashJoin +(69) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(68) Project +(70) Project Output [5]: [c_custkey#X, c_name#X, 
o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(69) Scan parquet +(71) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [2]: [l_orderkey#X, l_quantity#X] Condition : isnotnull(l_orderkey#X) -(71) Scan parquet +(73) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(72) HashAggregate +(74) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(73) Exchange +(75) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) HashAggregate +(76) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(75) Filter +(77) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(76) Project +(78) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(77) BroadcastExchange +(79) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(78) BroadcastHashJoin +(80) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(79) BroadcastExchange +(81) BroadcastExchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(80) BroadcastHashJoin +(82) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(81) Project +(83) Project Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(82) HashAggregate +(84) HashAggregate Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(83) Exchange +(85) Exchange Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(84) HashAggregate +(86) HashAggregate Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(85) TakeOrderedAndProject +(87) TakeOrderedAndProject Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] 
Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(86) AdaptiveSparkPlan +(88) AdaptiveSparkPlan Output [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/19.txt index 244f1c6ffd897..992c5328adbab 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/19.txt @@ -1,36 +1,35 @@ == Physical Plan == -AdaptiveSparkPlan (33) +AdaptiveSparkPlan (34) +- == Final Plan == - VeloxColumnarToRowExec (21) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (22) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) :- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ BroadcastQueryStage (8), Statistics(X) - +- ColumnarBroadcastExchange (7) - +- ^ NoopFilter (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (8), Statistics(X) + +- ColumnarBroadcastExchange (7) + +- ^ NoopFilter (5) + +- ^ Scan parquet (4) +- == Initial Plan == - HashAggregate (32) - +- Exchange (31) - +- HashAggregate (30) - +- Project (29) - +- BroadcastHashJoin Inner BuildRight (28) - :- Project (24) - : +- Filter (23) - : +- Scan parquet (22) - +- BroadcastExchange (27) - +- Filter (26) - +- Scan parquet (25) + HashAggregate (33) + +- Exchange (32) + +- HashAggregate (31) + +- Project (30) + +- BroadcastHashJoin Inner BuildRight (29) + :- Project (25) + : +- Filter (24) + : +- Scan parquet (23) + +- BroadcastExchange (28) + +- Filter (27) + +- Scan parquet (26) (1) Scan parquet @@ -97,91 +96,95 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(16) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [2]: [sum#X, isEmpty#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as 
decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(20) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(21) VeloxColumnarToRowExec +(22) VeloxColumnarToRowExec Input [1]: [revenue#X] -(22) Scan parquet +(23) Scan parquet Output [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipinstruct), In(l_shipmode, [AIR,AIR REG]), EqualTo(l_shipinstruct,DELIVER IN PERSON), IsNotNull(l_partkey), Or(Or(And(GreaterThanOrEqual(l_quantity,1.00),LessThanOrEqual(l_quantity,11.00)),And(GreaterThanOrEqual(l_quantity,10.00),LessThanOrEqual(l_quantity,20.00))),And(GreaterThanOrEqual(l_quantity,20.00),LessThanOrEqual(l_quantity,30.00)))] ReadSchema: struct -(23) Filter +(24) Filter Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Condition : ((((isnotnull(l_shipinstruct#X) AND l_shipmode#X IN (AIR,AIR REG)) AND (l_shipinstruct#X = DELIVER IN PERSON)) AND isnotnull(l_partkey#X)) AND ((((l_quantity#X >= 1.00) AND (l_quantity#X <= 11.00)) OR ((l_quantity#X >= 10.00) AND (l_quantity#X <= 20.00))) OR ((l_quantity#X >= 20.00) AND (l_quantity#X <= 30.00)))) -(24) Project +(25) Project Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] -(25) Scan parquet +(26) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(26) Filter +(27) Filter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Condition : (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(27) BroadcastExchange +(28) BroadcastExchange Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(28) BroadcastHashJoin +(29) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG 
BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(29) Project +(30) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(30) HashAggregate +(31) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(31) Exchange +(32) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(32) HashAggregate +(33) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(33) AdaptiveSparkPlan +(34) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/20.txt index fd6deaabc6f52..4a899ae239be3 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/20.txt @@ -1,103 +1,101 @@ == Physical Plan == -AdaptiveSparkPlan (104) +AdaptiveSparkPlan (109) +- == Final Plan == - VeloxColumnarToRowExec (67) - +- AQEShuffleRead (66) - +- ShuffleQueryStage (65), Statistics(X) - +- ColumnarExchange (64) - +- ^ ProjectExecTransformer (62) - +- ^ BroadcastHashJoinExecTransformer Inner (61) - :- ^ ProjectExecTransformer (52) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (51) - : :- ^ InputIteratorTransformer (9) - : : +- ^ InputAdapter (8) - : : +- ^ AQEShuffleRead (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) - : : +- ^ ProjectExecTransformer (3) - : : +- ^ NoopFilter (2) - : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (50) - : +- ^ InputAdapter (49) - : +- ^ BroadcastQueryStage (48), Statistics(X) - : +- ColumnarBroadcastExchange (47) - : +- ^ ProjectExecTransformer (45) - : +- ^ BroadcastHashJoinExecTransformer Inner (44) - : :- ^ InputIteratorTransformer (25) - : : +- ^ InputAdapter (24) - : : +- ^ BroadcastQueryStage (23), Statistics(X) - : : +- ColumnarBroadcastExchange (22) - : : +- ^ BroadcastHashJoinExecTransformer LeftSemi (20) - : : :- ^ NoopFilter (11) - : : : +- ^ Scan parquet (10) - : : +- ^ InputIteratorTransformer (19) - : : +- ^ InputAdapter (18) - : : +- ^ BroadcastQueryStage (17), Statistics(X) - : : +- ColumnarBroadcastExchange (16) - : : +- ^ ProjectExecTransformer 
(14) - : : +- ^ NoopFilter (13) - : : +- ^ Scan parquet (12) - : +- ^ FilterExecTransformer (43) - : +- ^ ProjectExecTransformer (42) - : +- ^ RegularHashAggregateExecTransformer (41) - : +- ^ InputIteratorTransformer (40) - : +- ^ InputAdapter (39) - : +- ^ ShuffleQueryStage (38), Statistics(X) - : +- ColumnarExchange (37) - : +- ^ ProjectExecTransformer (35) - : +- ^ FlushableHashAggregateExecTransformer (34) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (33) - : :- ^ ProjectExecTransformer (28) - : : +- ^ NoopFilter (27) - : : +- ^ Scan parquet (26) - : +- ^ InputIteratorTransformer (32) - : +- ^ InputAdapter (31) - : +- ^ BroadcastQueryStage (30), Statistics(X) - : +- ReusedExchange (29) - +- ^ InputIteratorTransformer (60) - +- ^ InputAdapter (59) - +- ^ BroadcastQueryStage (58), Statistics(X) - +- ColumnarBroadcastExchange (57) - +- ^ ProjectExecTransformer (55) - +- ^ NoopFilter (54) - +- ^ Scan parquet (53) + VeloxColumnarToRowExec (70) + +- AQEShuffleRead (69) + +- ShuffleQueryStage (68), Statistics(X) + +- ColumnarExchange (67) + +- VeloxAppendBatches (66) + +- ^ ProjectExecTransformer (64) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (63) + :- ^ ProjectExecTransformer (54) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (53) + : :- ^ InputIteratorTransformer (10) + : : +- AQEShuffleRead (8) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) + : : +- ^ ProjectExecTransformer (3) + : : +- ^ NoopFilter (2) + : : +- ^ Scan parquet (1) + : +- ^ InputIteratorTransformer (52) + : +- BroadcastQueryStage (50), Statistics(X) + : +- ColumnarBroadcastExchange (49) + : +- ^ ProjectExecTransformer (47) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (46) + : :- ^ InputIteratorTransformer (26) + : : +- BroadcastQueryStage (24), Statistics(X) + : : +- ColumnarBroadcastExchange (23) + : : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (21) + : : :- ^ NoopFilter (12) + : : : +- ^ Scan parquet (11) + : : +- ^ InputIteratorTransformer (20) + : : +- BroadcastQueryStage (18), Statistics(X) + : : +- ColumnarBroadcastExchange (17) + : : +- ^ ProjectExecTransformer (15) + : : +- ^ NoopFilter (14) + : : +- ^ Scan parquet (13) + : +- ^ FilterExecTransformer (45) + : +- ^ ProjectExecTransformer (44) + : +- ^ RegularHashAggregateExecTransformer (43) + : +- ^ InputIteratorTransformer (42) + : +- ShuffleQueryStage (40), Statistics(X) + : +- ColumnarExchange (39) + : +- VeloxAppendBatches (38) + : +- ^ ProjectExecTransformer (36) + : +- ^ FlushableHashAggregateExecTransformer (35) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (34) + : :- ^ ProjectExecTransformer (29) + : : +- ^ NoopFilter (28) + : : +- ^ Scan parquet (27) + : +- ^ InputIteratorTransformer (33) + : +- BroadcastQueryStage (31), Statistics(X) + : +- ReusedExchange (30) + +- ^ InputIteratorTransformer (62) + +- BroadcastQueryStage (60), Statistics(X) + +- ColumnarBroadcastExchange (59) + +- ^ ProjectExecTransformer (57) + +- ^ NoopFilter (56) + +- ^ Scan parquet (55) +- == Initial Plan == - Sort (103) - +- Exchange (102) - +- Project (101) - +- BroadcastHashJoin Inner BuildRight (100) - :- Project (95) - : +- ShuffledHashJoin LeftSemi BuildRight (94) - : :- Exchange (70) - : : +- Filter (69) - : : +- Scan parquet (68) - : +- Exchange (93) - : +- Project (92) - : +- BroadcastHashJoin Inner BuildLeft (91) - : :- BroadcastExchange (78) - : : +- BroadcastHashJoin LeftSemi BuildRight (77) - : : :- Filter (72) - : : : +- Scan 
parquet (71) - : : +- BroadcastExchange (76) - : : +- Project (75) - : : +- Filter (74) - : : +- Scan parquet (73) - : +- Filter (90) - : +- HashAggregate (89) - : +- Exchange (88) - : +- HashAggregate (87) - : +- BroadcastHashJoin LeftSemi BuildRight (86) - : :- Project (81) - : : +- Filter (80) - : : +- Scan parquet (79) - : +- BroadcastExchange (85) - : +- Project (84) - : +- Filter (83) - : +- Scan parquet (82) - +- BroadcastExchange (99) - +- Project (98) - +- Filter (97) - +- Scan parquet (96) + Sort (108) + +- Exchange (107) + +- Project (106) + +- BroadcastHashJoin Inner BuildRight (105) + :- Project (100) + : +- SortMergeJoin LeftSemi (99) + : :- Sort (74) + : : +- Exchange (73) + : : +- Filter (72) + : : +- Scan parquet (71) + : +- Sort (98) + : +- Exchange (97) + : +- Project (96) + : +- BroadcastHashJoin Inner BuildLeft (95) + : :- BroadcastExchange (82) + : : +- BroadcastHashJoin LeftSemi BuildRight (81) + : : :- Filter (76) + : : : +- Scan parquet (75) + : : +- BroadcastExchange (80) + : : +- Project (79) + : : +- Filter (78) + : : +- Scan parquet (77) + : +- Filter (94) + : +- HashAggregate (93) + : +- Exchange (92) + : +- HashAggregate (91) + : +- BroadcastHashJoin LeftSemi BuildRight (90) + : :- Project (85) + : : +- Filter (84) + : : +- Scan parquet (83) + : +- BroadcastExchange (89) + : +- Project (88) + : +- Filter (87) + : +- Scan parquet (86) + +- BroadcastExchange (104) + +- Project (103) + +- Filter (102) + +- Scan parquet (101) (1) Scan parquet @@ -119,438 +117,458 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: X -(7) AQEShuffleRead +(8) AQEShuffleRead Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: local -(8) InputAdapter +(9) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(10) Scan parquet +(11) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(11) NoopFilter +(12) NoopFilter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(12) Scan parquet +(13) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(13) NoopFilter +(14) NoopFilter Input [2]: [p_partkey#X, p_name#X] Arguments: [p_partkey#X, p_name#X] -(14) ProjectExecTransformer +(15) ProjectExecTransformer Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(15) WholeStageCodegenTransformer (X) +(16) WholeStageCodegenTransformer (X) Input [1]: [p_partkey#X] Arguments: false -(16) ColumnarBroadcastExchange +(17) ColumnarBroadcastExchange Input 
[1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(17) BroadcastQueryStage +(18) BroadcastQueryStage Output [1]: [p_partkey#X] Arguments: X -(18) InputAdapter +(19) InputAdapter Input [1]: [p_partkey#X] -(19) InputIteratorTransformer +(20) InputIteratorTransformer Input [1]: [p_partkey#X] -(20) BroadcastHashJoinExecTransformer +(21) BroadcastHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(22) ColumnarBroadcastExchange +(23) ColumnarBroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(23) BroadcastQueryStage +(24) BroadcastQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(24) InputAdapter +(25) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(25) InputIteratorTransformer +(26) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(26) Scan parquet +(27) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(27) NoopFilter +(28) NoopFilter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Arguments: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(28) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(29) ReusedExchange [Reuses operator id: 16] +(30) ReusedExchange [Reuses operator id: 17] Output [1]: [p_partkey#X] -(30) BroadcastQueryStage +(31) BroadcastQueryStage Output [1]: [p_partkey#X] Arguments: X -(31) InputAdapter +(32) InputAdapter Input [1]: [p_partkey#X] -(32) InputIteratorTransformer +(33) InputIteratorTransformer Input [1]: [p_partkey#X] -(33) BroadcastHashJoinExecTransformer +(34) BroadcastHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(34) FlushableHashAggregateExecTransformer +(35) FlushableHashAggregateExecTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(35) ProjectExecTransformer +(36) ProjectExecTransformer Output [5]: [hash(l_partkey#X, l_suppkey#X, 42) AS hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(36) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(37) ColumnarExchange +(38) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(39) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(38) ShuffleQueryStage 
+(40) ShuffleQueryStage Output [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(39) InputAdapter +(41) InputAdapter Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(40) InputIteratorTransformer +(42) InputIteratorTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(41) RegularHashAggregateExecTransformer +(43) RegularHashAggregateExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(42) ProjectExecTransformer +(44) ProjectExecTransformer Output [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3)) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(43) FilterExecTransformer +(45) FilterExecTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: isnotnull((0.5 * sum(l_quantity))#X) -(44) BroadcastHashJoinExecTransformer +(46) BroadcastHashJoinExecTransformer Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(45) ProjectExecTransformer +(47) ProjectExecTransformer Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(46) WholeStageCodegenTransformer (X) +(48) WholeStageCodegenTransformer (X) Input [1]: [ps_suppkey#X] Arguments: false -(47) ColumnarBroadcastExchange +(49) ColumnarBroadcastExchange Input [1]: [ps_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(48) BroadcastQueryStage +(50) BroadcastQueryStage Output [1]: [ps_suppkey#X] Arguments: X -(49) InputAdapter +(51) InputAdapter Input [1]: [ps_suppkey#X] -(50) InputIteratorTransformer +(52) InputIteratorTransformer Input [1]: [ps_suppkey#X] -(51) BroadcastHashJoinExecTransformer +(53) BroadcastHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(52) ProjectExecTransformer +(54) ProjectExecTransformer Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(53) Scan parquet +(55) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(54) NoopFilter +(56) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(55) ProjectExecTransformer +(57) ProjectExecTransformer Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(56) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [1]: [n_nationkey#X] Arguments: false -(57) ColumnarBroadcastExchange +(59) ColumnarBroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(58) BroadcastQueryStage +(60) BroadcastQueryStage Output [1]: [n_nationkey#X] Arguments: X -(59) InputAdapter +(61) InputAdapter Input [1]: [n_nationkey#X] -(60) InputIteratorTransformer +(62) InputIteratorTransformer Input [1]: [n_nationkey#X] -(61) BroadcastHashJoinExecTransformer +(63) BroadcastHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(62) 
ProjectExecTransformer +(64) ProjectExecTransformer Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(63) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(64) ColumnarExchange +(66) VeloxAppendBatches +Input [2]: [s_name#X, s_address#X] +Arguments: X + +(67) ColumnarExchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(65) ShuffleQueryStage +(68) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] Arguments: X -(66) AQEShuffleRead +(69) AQEShuffleRead Input [2]: [s_name#X, s_address#X] Arguments: local -(67) VeloxColumnarToRowExec +(70) VeloxColumnarToRowExec Input [2]: [s_name#X, s_address#X] -(68) Scan parquet +(71) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_nationkey)] ReadSchema: struct -(69) Filter +(72) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Condition : isnotnull(s_nationkey#X) -(70) Exchange +(73) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(74) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(75) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(72) Filter +(76) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Condition : ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(73) Scan parquet +(77) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(74) Filter +(78) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(75) Project +(79) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(76) BroadcastExchange +(80) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(77) BroadcastHashJoin +(81) BroadcastHashJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(78) BroadcastExchange +(82) BroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(79) Scan parquet +(83) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(80) Filter +(84) Filter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(81) Project +(85) Project Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(82) Scan 
parquet +(86) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(83) Filter +(87) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(84) Project +(88) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(85) BroadcastExchange +(89) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(86) BroadcastHashJoin +(90) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(87) HashAggregate +(91) HashAggregate Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(88) Exchange +(92) Exchange Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(89) HashAggregate +(93) HashAggregate Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3)) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(90) Filter +(94) Filter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Condition : isnotnull((0.5 * sum(l_quantity))#X) -(91) BroadcastHashJoin +(95) BroadcastHashJoin Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(92) Project +(96) Project Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(93) Exchange +(97) Exchange Input [1]: [ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(94) ShuffledHashJoin +(98) Sort +Input [1]: [ps_suppkey#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(99) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(95) Project +(100) Project Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(96) Scan parquet +(101) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(97) Filter +(102) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(98) Project +(103) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(99) BroadcastExchange +(104) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(100) BroadcastHashJoin +(105) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(101) Project +(106) Project Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(102) Exchange +(107) Exchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS 
FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(103) Sort +(108) Sort Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(104) AdaptiveSparkPlan +(109) AdaptiveSparkPlan Output [2]: [s_name#X, s_address#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/21.txt index ac047fdbcabfa..7627c7f4f147b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/21.txt @@ -1,90 +1,85 @@ == Physical Plan == -AdaptiveSparkPlan (91) +AdaptiveSparkPlan (92) +- == Final Plan == - VeloxColumnarToRowExec (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) + VeloxColumnarToRowExec (59) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ InputIteratorTransformer (56) + +- ShuffleQueryStage (54), Statistics(X) + +- ColumnarExchange (53) + +- VeloxAppendBatches (52) +- ^ ProjectExecTransformer (50) +- ^ FlushableHashAggregateExecTransformer (49) +- ^ ProjectExecTransformer (48) - +- ^ BroadcastHashJoinExecTransformer Inner (47) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) :- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (37) : :- ^ ProjectExecTransformer (28) - : : +- ^ BroadcastHashJoinExecTransformer Inner (27) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (27) : : :- ^ InputIteratorTransformer (7) - : : : +- ^ InputAdapter (6) - : : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : : +- ColumnarBroadcastExchange (4) - : : : +- ^ NoopFilter (2) - : : : +- ^ Scan parquet (1) - : : +- ^ BroadcastHashJoinExecTransformer LeftAnti (26) - : : :- ^ BroadcastHashJoinExecTransformer LeftSemi (17) + : : : +- BroadcastQueryStage (5), Statistics(X) + : : : +- ColumnarBroadcastExchange (4) + : : : +- ^ NoopFilter (2) + : : : +- ^ Scan parquet (1) + : : +- ^ BroadcastHashJoinExecTransformer LeftAnti BuildRight (26) + : : :- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (17) : : : :- ^ ProjectExecTransformer (10) : : : : +- ^ NoopFilter (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (16) - : : : +- ^ InputAdapter (15) - : : : +- ^ BroadcastQueryStage (14), Statistics(X) - : : : +- ColumnarBroadcastExchange (13) - : : : +- ^ Scan parquet (11) + : : : +- BroadcastQueryStage (14), Statistics(X) + : : : +- ColumnarBroadcastExchange (13) + : : : +- ^ Scan parquet (11) : : +- ^ InputIteratorTransformer (25) - : : +- ^ InputAdapter (24) - : : +- ^ BroadcastQueryStage (23), Statistics(X) - : : +- ColumnarBroadcastExchange (22) - : : +- ^ ProjectExecTransformer (20) - : : +- ^ NoopFilter (19) - : : +- ^ Scan parquet (18) + : : +- BroadcastQueryStage (23), Statistics(X) + : : +- ColumnarBroadcastExchange (22) + : : +- ^ ProjectExecTransformer (20) + : : +- ^ NoopFilter (19) + : : +- ^ Scan parquet (18) : +- ^ InputIteratorTransformer (36) - : +- ^ InputAdapter (35) - : +- ^ BroadcastQueryStage (34), Statistics(X) - : +- ColumnarBroadcastExchange (33) - : +- ^ ProjectExecTransformer (31) - : +- ^ NoopFilter (30) - : +- ^ Scan parquet (29) + : +- BroadcastQueryStage (34), Statistics(X) + : +- ColumnarBroadcastExchange (33) + : +- ^ 
ProjectExecTransformer (31) + : +- ^ NoopFilter (30) + : +- ^ Scan parquet (29) +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ BroadcastQueryStage (44), Statistics(X) - +- ColumnarBroadcastExchange (43) - +- ^ ProjectExecTransformer (41) - +- ^ NoopFilter (40) - +- ^ Scan parquet (39) + +- BroadcastQueryStage (44), Statistics(X) + +- ColumnarBroadcastExchange (43) + +- ^ ProjectExecTransformer (41) + +- ^ NoopFilter (40) + +- ^ Scan parquet (39) +- == Initial Plan == - TakeOrderedAndProject (90) - +- HashAggregate (89) - +- Exchange (88) - +- HashAggregate (87) - +- Project (86) - +- BroadcastHashJoin Inner BuildRight (85) - :- Project (80) - : +- BroadcastHashJoin Inner BuildRight (79) - : :- Project (74) - : : +- BroadcastHashJoin Inner BuildLeft (73) - : : :- BroadcastExchange (61) - : : : +- Filter (60) - : : : +- Scan parquet (59) - : : +- BroadcastHashJoin LeftAnti BuildRight (72) - : : :- BroadcastHashJoin LeftSemi BuildRight (67) - : : : :- Project (64) - : : : : +- Filter (63) - : : : : +- Scan parquet (62) - : : : +- BroadcastExchange (66) - : : : +- Scan parquet (65) - : : +- BroadcastExchange (71) - : : +- Project (70) - : : +- Filter (69) - : : +- Scan parquet (68) - : +- BroadcastExchange (78) - : +- Project (77) - : +- Filter (76) - : +- Scan parquet (75) - +- BroadcastExchange (84) - +- Project (83) - +- Filter (82) - +- Scan parquet (81) + TakeOrderedAndProject (91) + +- HashAggregate (90) + +- Exchange (89) + +- HashAggregate (88) + +- Project (87) + +- BroadcastHashJoin Inner BuildRight (86) + :- Project (81) + : +- BroadcastHashJoin Inner BuildRight (80) + : :- Project (75) + : : +- BroadcastHashJoin Inner BuildLeft (74) + : : :- BroadcastExchange (62) + : : : +- Filter (61) + : : : +- Scan parquet (60) + : : +- BroadcastHashJoin LeftAnti BuildRight (73) + : : :- BroadcastHashJoin LeftSemi BuildRight (68) + : : : :- Project (65) + : : : : +- Filter (64) + : : : : +- Scan parquet (63) + : : : +- BroadcastExchange (67) + : : : +- Scan parquet (66) + : : +- BroadcastExchange (72) + : : +- Project (71) + : : +- Filter (70) + : : +- Scan parquet (69) + : +- BroadcastExchange (79) + : +- Project (78) + : +- Filter (77) + : +- Scan parquet (76) + +- BroadcastExchange (85) + +- Project (84) + +- Filter (83) + +- Scan parquet (82) (1) Scan parquet @@ -306,190 +301,194 @@ Input [2]: [s_name#X, count#X] Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: false -(52) ColumnarExchange +(52) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, count#X] +Arguments: X + +(53) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(54) ShuffleQueryStage Output [2]: [s_name#X, count#X] Arguments: X -(54) InputAdapter +(55) InputAdapter Input [2]: [s_name#X, count#X] -(55) InputIteratorTransformer +(56) InputIteratorTransformer Input [2]: [s_name#X, count#X] -(56) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(57) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, numwait#X] Arguments: false -(58) VeloxColumnarToRowExec +(59) VeloxColumnarToRowExec Input [2]: [s_name#X, numwait#X] -(59) Scan parquet +(60) Scan parquet Output [3]: [s_suppkey#X, s_name#X, 
s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(60) Filter +(61) Filter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(61) BroadcastExchange +(62) BroadcastExchange Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(62) Scan parquet +(63) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(63) Filter +(64) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(64) Project +(65) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(65) Scan parquet +(66) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(66) BroadcastExchange +(67) BroadcastExchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(67) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(68) Scan parquet +(69) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(69) Filter +(70) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(70) Project +(71) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(71) BroadcastExchange +(72) BroadcastExchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(72) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(73) BroadcastHashJoin +(74) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(74) Project +(75) Project Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(75) Scan parquet +(76) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(76) Filter +(77) Filter Input [2]: [o_orderkey#X, o_orderstatus#X] Condition : ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(77) Project +(78) Project Output [1]: [o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(78) BroadcastExchange +(79) BroadcastExchange Input [1]: [o_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(79) BroadcastHashJoin +(80) 
BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(80) Project +(81) Project Output [2]: [s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(81) Scan parquet +(82) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(82) Filter +(83) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(83) Project +(84) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(84) BroadcastExchange +(85) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(85) BroadcastHashJoin +(86) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(86) Project +(87) Project Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(87) HashAggregate +(88) HashAggregate Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(88) Exchange +(89) Exchange Input [2]: [s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(89) HashAggregate +(90) HashAggregate Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(90) TakeOrderedAndProject +(91) TakeOrderedAndProject Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X] -(91) AdaptiveSparkPlan +(92) AdaptiveSparkPlan Output [2]: [s_name#X, numwait#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/22.txt index 3306ea6b2fd9a..9a0475d25d78e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/22.txt @@ -1,40 +1,39 @@ == Physical Plan == -AdaptiveSparkPlan (38) +AdaptiveSparkPlan (40) +- == Final Plan == - VeloxColumnarToRowExec (26) - +- ^ SortExecTransformer (24) - +- ^ InputIteratorTransformer (23) - +- ^ InputAdapter (22) - +- ^ ShuffleQueryStage (21), Statistics(X) - +- ColumnarExchange (20) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ InputIteratorTransformer (17) - +- ^ InputAdapter (16) - +- ^ ShuffleQueryStage (15), Statistics(X) - +- ColumnarExchange (14) + VeloxColumnarToRowExec (28) + +- ^ SortExecTransformer (26) + +- ^ InputIteratorTransformer (25) + +- ShuffleQueryStage (23), Statistics(X) + +- ColumnarExchange (22) + +- VeloxAppendBatches (21) + +- ^ RegularHashAggregateExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) +- ^ ProjectExecTransformer (12) +- ^ FlushableHashAggregateExecTransformer (11) +- ^ ProjectExecTransformer (10) - +- ^ BroadcastHashJoinExecTransformer LeftAnti (9) + +- ^ BroadcastHashJoinExecTransformer LeftAnti BuildRight (9) :- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (8) - +- ^ InputAdapter (7) - +- ^ 
BroadcastQueryStage (6), Statistics(X) - +- ColumnarBroadcastExchange (5) - +- ^ Scan parquet (3) + +- BroadcastQueryStage (6), Statistics(X) + +- ColumnarBroadcastExchange (5) + +- ^ Scan parquet (3) +- == Initial Plan == - Sort (37) - +- Exchange (36) - +- HashAggregate (35) - +- Exchange (34) - +- HashAggregate (33) - +- Project (32) - +- BroadcastHashJoin LeftAnti BuildRight (31) - :- Filter (28) - : +- Scan parquet (27) - +- BroadcastExchange (30) - +- Scan parquet (29) + Sort (39) + +- Exchange (38) + +- HashAggregate (37) + +- Exchange (36) + +- HashAggregate (35) + +- Project (34) + +- BroadcastHashJoin LeftAnti BuildRight (33) + :- Filter (30) + : +- Scan parquet (29) + +- BroadcastExchange (32) + +- Scan parquet (31) (1) Scan parquet @@ -96,227 +95,239 @@ Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(19) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(20) ColumnarExchange +(21) VeloxAppendBatches +Input [3]: [cntrycode#X, numcust#X, totacctbal#X] +Arguments: X + +(22) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(23) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: X -(22) InputAdapter +(24) InputAdapter Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(23) InputIteratorTransformer +(25) InputIteratorTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(24) SortExecTransformer +(26) SortExecTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(25) WholeStageCodegenTransformer (X) +(27) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(26) VeloxColumnarToRowExec +(28) VeloxColumnarToRowExec Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(27) Scan parquet +(29) Scan parquet Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal)] ReadSchema: struct -(28) Filter +(30) Filter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) AND (cast(c_acctbal#X as decimal(16,6)) > Subquery subquery#X, [id=#X])) -(29) Scan 
parquet +(31) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(30) BroadcastExchange +(32) BroadcastExchange Input [1]: [o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(31) BroadcastHashJoin +(33) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(32) Project +(34) Project Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(33) HashAggregate +(35) HashAggregate Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(34) Exchange +(36) Exchange Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(35) HashAggregate +(37) HashAggregate Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(36) Exchange +(38) Exchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(37) Sort +(39) Sort Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(38) AdaptiveSparkPlan +(40) AdaptiveSparkPlan Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: isFinalPlan=true ===== Subqueries ===== Subquery:1 Hosting operator id = 1 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (57) +AdaptiveSparkPlan (60) +- == Final Plan == - VeloxColumnarToRowExec (50) - +- ^ RegularHashAggregateExecTransformer (48) - +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ ShuffleQueryStage (45), Statistics(X) - +- ColumnarExchange (44) - +- ^ FlushableHashAggregateExecTransformer (42) - +- ^ ProjectExecTransformer (41) - +- ^ NoopFilter (40) - +- ^ Scan parquet (39) + VeloxColumnarToRowExec (53) + +- ^ RegularHashAggregateExecTransformer (51) + +- ^ InputIteratorTransformer (50) + +- ShuffleQueryStage (48), Statistics(X) + +- ColumnarExchange (47) + +- VeloxAppendBatches (46) + +- ^ FlushableHashAggregateExecTransformer (44) + +- ^ ProjectExecTransformer (43) + +- ^ NoopFilter (42) + +- ^ Scan parquet (41) +- == Initial Plan == - HashAggregate (56) - +- Exchange (55) - +- HashAggregate (54) - +- Project (53) - +- Filter (52) - +- Scan parquet (51) + HashAggregate (59) + +- Exchange (58) + +- HashAggregate (57) + +- Project (56) + +- Filter (55) + +- Scan parquet (54) -(39) Scan parquet +(41) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(40) NoopFilter +(42) NoopFilter Input [2]: [c_phone#X, c_acctbal#X] Arguments: [c_phone#X, c_acctbal#X] -(41) ProjectExecTransformer +(43) ProjectExecTransformer Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(42) FlushableHashAggregateExecTransformer +(44) FlushableHashAggregateExecTransformer Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(43) WholeStageCodegenTransformer 
(X) +(45) WholeStageCodegenTransformer (X) Input [2]: [sum#X, count#X] Arguments: false -(44) ColumnarExchange +(46) VeloxAppendBatches +Input [2]: [sum#X, count#X] +Arguments: X + +(47) ColumnarExchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(45) ShuffleQueryStage +(48) ShuffleQueryStage Output [2]: [sum#X, count#X] Arguments: X -(46) InputAdapter +(49) InputAdapter Input [2]: [sum#X, count#X] -(47) InputIteratorTransformer +(50) InputIteratorTransformer Input [2]: [sum#X, count#X] -(48) RegularHashAggregateExecTransformer +(51) RegularHashAggregateExecTransformer Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(49) WholeStageCodegenTransformer (X) +(52) WholeStageCodegenTransformer (X) Input [1]: [avg(c_acctbal)#X] Arguments: false -(50) VeloxColumnarToRowExec +(53) VeloxColumnarToRowExec Input [1]: [avg(c_acctbal)#X] -(51) Scan parquet +(54) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(52) Filter +(55) Filter Input [2]: [c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND (c_acctbal#X > 0.00)) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) -(53) Project +(56) Project Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(54) HashAggregate +(57) HashAggregate Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(55) Exchange +(58) Exchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(56) HashAggregate +(59) HashAggregate Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(57) AdaptiveSparkPlan +(60) AdaptiveSparkPlan Output [1]: [avg(c_acctbal)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/3.txt index 0bcfb2c8228d0..60843c0991ade 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/3.txt @@ -1,55 +1,53 @@ == Physical Plan == -AdaptiveSparkPlan (53) +AdaptiveSparkPlan (54) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- TakeOrderedAndProjectExecTransformer (33) - +- ^ ProjectExecTransformer (31) - +- ^ RegularHashAggregateExecTransformer (30) - +- ^ InputIteratorTransformer (29) - +- ^ InputAdapter (28) - +- ^ ShuffleQueryStage (27), Statistics(X) - +- ColumnarExchange (26) + VeloxColumnarToRowExec (35) + +- TakeOrderedAndProjectExecTransformer (34) + +- ^ ProjectExecTransformer (32) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) +- ^ ProjectExecTransformer (24) +- ^ FlushableHashAggregateExecTransformer (23) +- ^ ProjectExecTransformer (22) - +- ^ BroadcastHashJoinExecTransformer Inner (21) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (21) :- ^ ProjectExecTransformer (12) - : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : +- ^ 
BroadcastHashJoinExecTransformer Inner BuildLeft (11) : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ BroadcastQueryStage (6), Statistics(X) - : : +- ColumnarBroadcastExchange (5) - : : +- ^ ProjectExecTransformer (3) - : : +- ^ NoopFilter (2) - : : +- ^ Scan parquet (1) + : : +- BroadcastQueryStage (6), Statistics(X) + : : +- ColumnarBroadcastExchange (5) + : : +- ^ ProjectExecTransformer (3) + : : +- ^ NoopFilter (2) + : : +- ^ Scan parquet (1) : +- ^ NoopFilter (10) : +- ^ Scan parquet (9) +- ^ InputIteratorTransformer (20) - +- ^ InputAdapter (19) - +- ^ BroadcastQueryStage (18), Statistics(X) - +- ColumnarBroadcastExchange (17) - +- ^ ProjectExecTransformer (15) - +- ^ NoopFilter (14) - +- ^ Scan parquet (13) + +- BroadcastQueryStage (18), Statistics(X) + +- ColumnarBroadcastExchange (17) + +- ^ ProjectExecTransformer (15) + +- ^ NoopFilter (14) + +- ^ Scan parquet (13) +- == Initial Plan == - TakeOrderedAndProject (52) - +- HashAggregate (51) - +- Exchange (50) - +- HashAggregate (49) - +- Project (48) - +- BroadcastHashJoin Inner BuildRight (47) - :- Project (42) - : +- BroadcastHashJoin Inner BuildLeft (41) - : :- BroadcastExchange (38) - : : +- Project (37) - : : +- Filter (36) - : : +- Scan parquet (35) - : +- Filter (40) - : +- Scan parquet (39) - +- BroadcastExchange (46) - +- Project (45) - +- Filter (44) - +- Scan parquet (43) + TakeOrderedAndProject (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- Project (49) + +- BroadcastHashJoin Inner BuildRight (48) + :- Project (43) + : +- BroadcastHashJoin Inner BuildLeft (42) + : :- BroadcastExchange (39) + : : +- Project (38) + : : +- Filter (37) + : : +- Scan parquet (36) + : +- Filter (41) + : +- Scan parquet (40) + +- BroadcastExchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -162,131 +160,135 @@ Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: false -(26) ColumnarExchange +(26) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] +Arguments: X + +(27) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(27) ShuffleQueryStage +(28) ShuffleQueryStage Output [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: X -(28) InputAdapter +(29) InputAdapter Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(29) InputIteratorTransformer +(30) InputIteratorTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(30) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - 
promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(31) ProjectExecTransformer +(32) ProjectExecTransformer Output [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] Input [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(32) WholeStageCodegenTransformer (X) +(33) WholeStageCodegenTransformer (X) Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: false -(33) TakeOrderedAndProjectExecTransformer +(34) TakeOrderedAndProjectExecTransformer Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X], 0 -(34) VeloxColumnarToRowExec +(35) VeloxColumnarToRowExec Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(35) Scan parquet +(36) Scan parquet Output [2]: [c_custkey#X, c_mktsegment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_mktsegment), EqualTo(c_mktsegment,BUILDING), IsNotNull(c_custkey)] ReadSchema: struct -(36) Filter +(37) Filter Input [2]: [c_custkey#X, c_mktsegment#X] Condition : ((isnotnull(c_mktsegment#X) AND (c_mktsegment#X = BUILDING)) AND isnotnull(c_custkey#X)) -(37) Project +(38) Project Output [1]: [c_custkey#X] Input [2]: [c_custkey#X, c_mktsegment#X] -(38) BroadcastExchange +(39) BroadcastExchange Input [1]: [c_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(39) Scan parquet +(40) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(40) Filter +(41) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Condition : (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(41) BroadcastHashJoin +(42) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(42) Project +(43) Project Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(43) Scan parquet +(44) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(44) Filter +(45) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND 
isnotnull(l_orderkey#X)) -(45) Project +(46) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(46) BroadcastExchange +(47) BroadcastExchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(47) BroadcastHashJoin +(48) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(48) Project +(49) Project Output [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(49) HashAggregate +(50) HashAggregate Input [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(50) Exchange +(51) Exchange Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) HashAggregate +(52) HashAggregate Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] -(52) TakeOrderedAndProject +(53) TakeOrderedAndProject Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(53) AdaptiveSparkPlan +(54) AdaptiveSparkPlan Output [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/4.txt index b547b4051bcf0..b142ee1d8f33a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/4.txt @@ -1,46 +1,45 @@ == Physical Plan == -AdaptiveSparkPlan (44) +AdaptiveSparkPlan (46) +- == Final Plan == - VeloxColumnarToRowExec (29) - +- ^ SortExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24), 
Statistics(X) - +- ColumnarExchange (23) - +- ^ RegularHashAggregateExecTransformer (21) - +- ^ InputIteratorTransformer (20) - +- ^ InputAdapter (19) - +- ^ ShuffleQueryStage (18), Statistics(X) - +- ColumnarExchange (17) + VeloxColumnarToRowExec (31) + +- ^ SortExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ InputIteratorTransformer (21) + +- ShuffleQueryStage (19), Statistics(X) + +- ColumnarExchange (18) + +- VeloxAppendBatches (17) +- ^ ProjectExecTransformer (15) +- ^ FlushableHashAggregateExecTransformer (14) +- ^ ProjectExecTransformer (13) - +- ^ BroadcastHashJoinExecTransformer LeftSemi (12) + +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (12) :- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (11) - +- ^ InputAdapter (10) - +- ^ BroadcastQueryStage (9), Statistics(X) - +- ColumnarBroadcastExchange (8) - +- ^ ProjectExecTransformer (6) - +- ^ NoopFilter (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (9), Statistics(X) + +- ColumnarBroadcastExchange (8) + +- ^ ProjectExecTransformer (6) + +- ^ NoopFilter (5) + +- ^ Scan parquet (4) +- == Initial Plan == - Sort (43) - +- Exchange (42) - +- HashAggregate (41) - +- Exchange (40) - +- HashAggregate (39) - +- Project (38) - +- BroadcastHashJoin LeftSemi BuildRight (37) - :- Project (32) - : +- Filter (31) - : +- Scan parquet (30) - +- BroadcastExchange (36) - +- Project (35) - +- Filter (34) - +- Scan parquet (33) + Sort (45) + +- Exchange (44) + +- HashAggregate (43) + +- Exchange (42) + +- HashAggregate (41) + +- Project (40) + +- BroadcastHashJoin LeftSemi BuildRight (39) + :- Project (34) + : +- Filter (33) + : +- Scan parquet (32) + +- BroadcastExchange (38) + +- Project (37) + +- Filter (36) + +- Scan parquet (35) (1) Scan parquet @@ -115,125 +114,133 @@ Input [2]: [o_orderpriority#X, count#X] Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: false -(17) ColumnarExchange +(17) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] +Arguments: X + +(18) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] -(18) ShuffleQueryStage +(19) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] Arguments: X -(19) InputAdapter +(20) InputAdapter Input [2]: [o_orderpriority#X, count#X] -(20) InputIteratorTransformer +(21) InputIteratorTransformer Input [2]: [o_orderpriority#X, count#X] -(21) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [2]: [o_orderpriority#X, order_count#X] +Arguments: X + +(25) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] Arguments: X -(25) 
InputAdapter +(27) InputAdapter Input [2]: [o_orderpriority#X, order_count#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [2]: [o_orderpriority#X, order_count#X] -(27) SortExecTransformer +(29) SortExecTransformer Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(28) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(29) VeloxColumnarToRowExec +(31) VeloxColumnarToRowExec Input [2]: [o_orderpriority#X, order_count#X] -(30) Scan parquet +(32) Scan parquet Output [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-07-01), LessThan(o_orderdate,1993-10-01)] ReadSchema: struct -(31) Filter +(33) Filter Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Condition : ((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-07-01)) AND (o_orderdate#X < 1993-10-01)) -(32) Project +(34) Project Output [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] -(33) Scan parquet +(35) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(34) Filter +(36) Filter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(35) Project +(37) Project Output [1]: [l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(36) BroadcastExchange +(38) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(37) BroadcastHashJoin +(39) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(38) Project +(40) Project Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(39) HashAggregate +(41) HashAggregate Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(40) Exchange +(42) Exchange Input [2]: [o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) HashAggregate +(43) HashAggregate Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(42) Exchange +(44) Exchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) Sort +(45) Sort Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(44) AdaptiveSparkPlan +(46) AdaptiveSparkPlan Output [2]: [o_orderpriority#X, order_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/5.txt index a87ef7c618c0d..c8a62c3aca1f4 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/5.txt +++ 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/5.txt @@ -1,98 +1,93 @@ == Physical Plan == -AdaptiveSparkPlan (100) +AdaptiveSparkPlan (102) +- == Final Plan == - VeloxColumnarToRowExec (65) - +- ^ SortExecTransformer (63) - +- ^ InputIteratorTransformer (62) - +- ^ InputAdapter (61) - +- ^ ShuffleQueryStage (60), Statistics(X) - +- ColumnarExchange (59) - +- ^ RegularHashAggregateExecTransformer (57) - +- ^ InputIteratorTransformer (56) - +- ^ InputAdapter (55) - +- ^ ShuffleQueryStage (54), Statistics(X) - +- ColumnarExchange (53) + VeloxColumnarToRowExec (67) + +- ^ SortExecTransformer (65) + +- ^ InputIteratorTransformer (64) + +- ShuffleQueryStage (62), Statistics(X) + +- ColumnarExchange (61) + +- VeloxAppendBatches (60) + +- ^ RegularHashAggregateExecTransformer (58) + +- ^ InputIteratorTransformer (57) + +- ShuffleQueryStage (55), Statistics(X) + +- ColumnarExchange (54) + +- VeloxAppendBatches (53) +- ^ ProjectExecTransformer (51) +- ^ FlushableHashAggregateExecTransformer (50) +- ^ ProjectExecTransformer (49) - +- ^ BroadcastHashJoinExecTransformer Inner (48) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (48) :- ^ ProjectExecTransformer (39) - : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : :- ^ ProjectExecTransformer (30) - : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : :- ^ ProjectExecTransformer (21) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : :- ^ ProjectExecTransformer (12) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : :- ^ InputIteratorTransformer (7) - : : : : : +- ^ InputAdapter (6) - : : : : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (4) - : : : : : +- ^ NoopFilter (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (5), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (4) + : : : : : +- ^ NoopFilter (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ ProjectExecTransformer (10) : : : : +- ^ NoopFilter (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (19) - : : : +- ^ InputAdapter (18) - : : : +- ^ BroadcastQueryStage (17), Statistics(X) - : : : +- ColumnarBroadcastExchange (16) - : : : +- ^ NoopFilter (14) - : : : +- ^ Scan parquet (13) + : : : +- BroadcastQueryStage (17), Statistics(X) + : : : +- ColumnarBroadcastExchange (16) + : : : +- ^ NoopFilter (14) + : : : +- ^ Scan parquet (13) : : +- ^ InputIteratorTransformer (28) - : : +- ^ InputAdapter (27) - : : +- ^ BroadcastQueryStage (26), Statistics(X) - : : +- ColumnarBroadcastExchange (25) - : : +- ^ NoopFilter (23) - : : +- ^ Scan parquet (22) + : : +- BroadcastQueryStage (26), Statistics(X) + : : +- ColumnarBroadcastExchange (25) + : : +- ^ NoopFilter (23) + : : +- ^ Scan parquet (22) : +- ^ InputIteratorTransformer (37) - : +- ^ InputAdapter (36) - : +- ^ BroadcastQueryStage (35), Statistics(X) - : +- ColumnarBroadcastExchange (34) - : +- ^ NoopFilter (32) - : +- ^ Scan parquet (31) + : +- BroadcastQueryStage (35), Statistics(X) + : +- ColumnarBroadcastExchange (34) + : +- ^ NoopFilter (32) + : +- ^ Scan parquet (31) +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ BroadcastQueryStage (45), Statistics(X) - +- ColumnarBroadcastExchange (44) - +- ^ 
ProjectExecTransformer (42) - +- ^ NoopFilter (41) - +- ^ Scan parquet (40) + +- BroadcastQueryStage (45), Statistics(X) + +- ColumnarBroadcastExchange (44) + +- ^ ProjectExecTransformer (42) + +- ^ NoopFilter (41) + +- ^ Scan parquet (40) +- == Initial Plan == - Sort (99) - +- Exchange (98) - +- HashAggregate (97) - +- Exchange (96) - +- HashAggregate (95) - +- Project (94) - +- BroadcastHashJoin Inner BuildRight (93) - :- Project (88) - : +- BroadcastHashJoin Inner BuildRight (87) - : :- Project (83) - : : +- BroadcastHashJoin Inner BuildRight (82) - : : :- Project (78) - : : : +- BroadcastHashJoin Inner BuildRight (77) - : : : :- Project (73) - : : : : +- BroadcastHashJoin Inner BuildLeft (72) - : : : : :- BroadcastExchange (68) - : : : : : +- Filter (67) - : : : : : +- Scan parquet (66) - : : : : +- Project (71) - : : : : +- Filter (70) - : : : : +- Scan parquet (69) - : : : +- BroadcastExchange (76) - : : : +- Filter (75) - : : : +- Scan parquet (74) - : : +- BroadcastExchange (81) - : : +- Filter (80) - : : +- Scan parquet (79) - : +- BroadcastExchange (86) - : +- Filter (85) - : +- Scan parquet (84) - +- BroadcastExchange (92) - +- Project (91) - +- Filter (90) - +- Scan parquet (89) + Sort (101) + +- Exchange (100) + +- HashAggregate (99) + +- Exchange (98) + +- HashAggregate (97) + +- Project (96) + +- BroadcastHashJoin Inner BuildRight (95) + :- Project (90) + : +- BroadcastHashJoin Inner BuildRight (89) + : :- Project (85) + : : +- BroadcastHashJoin Inner BuildRight (84) + : : :- Project (80) + : : : +- BroadcastHashJoin Inner BuildRight (79) + : : : :- Project (75) + : : : : +- BroadcastHashJoin Inner BuildLeft (74) + : : : : :- BroadcastExchange (70) + : : : : : +- Filter (69) + : : : : : +- Scan parquet (68) + : : : : +- Project (73) + : : : : +- Filter (72) + : : : : +- Scan parquet (71) + : : : +- BroadcastExchange (78) + : : : +- Filter (77) + : : : +- Scan parquet (76) + : : +- BroadcastExchange (83) + : : +- Filter (82) + : : +- Scan parquet (81) + : +- BroadcastExchange (88) + : +- Filter (87) + : +- Scan parquet (86) + +- BroadcastExchange (94) + +- Project (93) + +- Filter (92) + +- Scan parquet (91) (1) Scan parquet @@ -319,221 +314,229 @@ Input [3]: [n_name#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: false -(53) ColumnarExchange +(53) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] +Arguments: X + +(54) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(54) ShuffleQueryStage +(55) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] Arguments: X -(55) InputAdapter +(56) InputAdapter Input [3]: [n_name#X, sum#X, isEmpty#X] -(56) InputIteratorTransformer +(57) InputIteratorTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] -(57) RegularHashAggregateExecTransformer +(58) RegularHashAggregateExecTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), 
DecimalType(26,4)))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(58) WholeStageCodegenTransformer (X) +(59) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(59) ColumnarExchange +(60) VeloxAppendBatches +Input [2]: [n_name#X, revenue#X] +Arguments: X + +(61) ColumnarExchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(60) ShuffleQueryStage +(62) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] Arguments: X -(61) InputAdapter +(63) InputAdapter Input [2]: [n_name#X, revenue#X] -(62) InputIteratorTransformer +(64) InputIteratorTransformer Input [2]: [n_name#X, revenue#X] -(63) SortExecTransformer +(65) SortExecTransformer Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(64) WholeStageCodegenTransformer (X) +(66) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(65) VeloxColumnarToRowExec +(67) VeloxColumnarToRowExec Input [2]: [n_name#X, revenue#X] -(66) Scan parquet +(68) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(67) Filter +(69) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(68) BroadcastExchange +(70) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(69) Scan parquet +(71) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(71) Project +(73) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(72) BroadcastHashJoin +(74) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(73) Project +(75) Project Output [2]: [c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(74) Scan parquet +(76) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(75) Filter +(77) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(76) BroadcastExchange +(78) BroadcastExchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(77) BroadcastHashJoin +(79) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(78) Project +(80) Project Output [4]: [c_nationkey#X, 
l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(79) Scan parquet +(81) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(80) Filter +(82) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(81) BroadcastExchange +(83) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(82) BroadcastHashJoin +(84) BroadcastHashJoin Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join condition: None -(83) Project +(85) Project Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(84) Scan parquet +(86) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(85) Filter +(87) Filter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(86) BroadcastExchange +(88) BroadcastExchange Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(87) BroadcastHashJoin +(89) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(88) Project +(90) Project Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(89) Scan parquet +(91) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(90) Filter +(92) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(91) Project +(93) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(92) BroadcastExchange +(94) BroadcastExchange Input [1]: [r_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(93) BroadcastHashJoin +(95) BroadcastHashJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(94) Project +(96) Project Output [3]: [l_extendedprice#X, l_discount#X, n_name#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(95) HashAggregate +(97) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, n_name#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(96) Exchange +(98) Exchange Input [3]: [n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) HashAggregate +(99) HashAggregate Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions 
[1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(98) Exchange +(100) Exchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Sort +(101) Sort Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(100) AdaptiveSparkPlan +(102) AdaptiveSparkPlan Output [2]: [n_name#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/6.txt index 51c5836bdd11e..fa9c936a0ca11 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/6.txt @@ -1,23 +1,23 @@ == Physical Plan == -AdaptiveSparkPlan (19) +AdaptiveSparkPlan (20) +- == Final Plan == - VeloxColumnarToRowExec (12) - +- ^ RegularHashAggregateExecTransformer (10) - +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ ShuffleQueryStage (7), Statistics(X) - +- ColumnarExchange (6) + VeloxColumnarToRowExec (13) + +- ^ RegularHashAggregateExecTransformer (11) + +- ^ InputIteratorTransformer (10) + +- ShuffleQueryStage (8), Statistics(X) + +- ColumnarExchange (7) + +- VeloxAppendBatches (6) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ NoopFilter (2) +- ^ Scan parquet (1) +- == Initial Plan == - HashAggregate (18) - +- Exchange (17) - +- HashAggregate (16) - +- Project (15) - +- Filter (14) - +- Scan parquet (13) + HashAggregate (19) + +- Exchange (18) + +- HashAggregate (17) + +- Project (16) + +- Filter (15) + +- Scan parquet (14) (1) Scan parquet @@ -46,67 +46,71 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(6) ColumnarExchange +(6) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(7) ShuffleQueryStage +(8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(8) InputAdapter +(9) InputAdapter Input [2]: [sum#X, isEmpty#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(10) RegularHashAggregateExecTransformer +(11) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X AS revenue#X] -(11) 
WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(12) VeloxColumnarToRowExec +(13) VeloxColumnarToRowExec Input [1]: [revenue#X] -(13) Scan parquet +(14) Scan parquet Output [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), IsNotNull(l_discount), IsNotNull(l_quantity), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), GreaterThanOrEqual(l_discount,0.05), LessThanOrEqual(l_discount,0.07), LessThan(l_quantity,24.00)] ReadSchema: struct -(14) Filter +(15) Filter Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((((((isnotnull(l_shipdate#X) AND isnotnull(l_discount#X)) AND isnotnull(l_quantity#X)) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND (l_discount#X >= 0.05)) AND (l_discount#X <= 0.07)) AND (l_quantity#X < 24.00)) -(15) Project +(16) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) HashAggregate +(17) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(17) Exchange +(18) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(18) HashAggregate +(19) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X AS revenue#X] -(19) AdaptiveSparkPlan +(20) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/7.txt index 37e9e52142568..f098b9ba837f5 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/7.txt @@ -1,92 +1,87 @@ == Physical Plan == -AdaptiveSparkPlan (93) +AdaptiveSparkPlan (95) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- ^ SortExecTransformer (58) - +- ^ InputIteratorTransformer (57) - +- ^ InputAdapter (56) - +- ^ ShuffleQueryStage (55), Statistics(X) - +- ColumnarExchange (54) - +- ^ RegularHashAggregateExecTransformer (52) - +- ^ InputIteratorTransformer (51) - +- ^ InputAdapter (50) - +- ^ ShuffleQueryStage (49), Statistics(X) - +- ColumnarExchange (48) + VeloxColumnarToRowExec (62) + +- ^ SortExecTransformer (60) + +- ^ InputIteratorTransformer (59) + +- ShuffleQueryStage (57), Statistics(X) + +- ColumnarExchange (56) + +- VeloxAppendBatches (55) + +- ^ RegularHashAggregateExecTransformer (53) + +- ^ InputIteratorTransformer (52) + +- ShuffleQueryStage (50), Statistics(X) + +- ColumnarExchange (49) + +- VeloxAppendBatches (48) +- ^ ProjectExecTransformer (46) +- ^ FlushableHashAggregateExecTransformer (45) +- ^ ProjectExecTransformer (44) - +- ^ 
BroadcastHashJoinExecTransformer Inner (43) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (43) :- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (37) : :- ^ ProjectExecTransformer (29) - : : +- ^ BroadcastHashJoinExecTransformer Inner (28) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (28) : : :- ^ ProjectExecTransformer (20) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (19) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (19) : : : :- ^ ProjectExecTransformer (11) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (10) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (10) : : : : :- ^ InputIteratorTransformer (7) - : : : : : +- ^ InputAdapter (6) - : : : : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (4) - : : : : : +- ^ NoopFilter (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (5), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (4) + : : : : : +- ^ NoopFilter (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ NoopFilter (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (18) - : : : +- ^ InputAdapter (17) - : : : +- ^ BroadcastQueryStage (16), Statistics(X) - : : : +- ColumnarBroadcastExchange (15) - : : : +- ^ NoopFilter (13) - : : : +- ^ Scan parquet (12) + : : : +- BroadcastQueryStage (16), Statistics(X) + : : : +- ColumnarBroadcastExchange (15) + : : : +- ^ NoopFilter (13) + : : : +- ^ Scan parquet (12) : : +- ^ InputIteratorTransformer (27) - : : +- ^ InputAdapter (26) - : : +- ^ BroadcastQueryStage (25), Statistics(X) - : : +- ColumnarBroadcastExchange (24) - : : +- ^ NoopFilter (22) - : : +- ^ Scan parquet (21) + : : +- BroadcastQueryStage (25), Statistics(X) + : : +- ColumnarBroadcastExchange (24) + : : +- ^ NoopFilter (22) + : : +- ^ Scan parquet (21) : +- ^ InputIteratorTransformer (36) - : +- ^ InputAdapter (35) - : +- ^ BroadcastQueryStage (34), Statistics(X) - : +- ColumnarBroadcastExchange (33) - : +- ^ NoopFilter (31) - : +- ^ Scan parquet (30) + : +- BroadcastQueryStage (34), Statistics(X) + : +- ColumnarBroadcastExchange (33) + : +- ^ NoopFilter (31) + : +- ^ Scan parquet (30) +- ^ InputIteratorTransformer (42) - +- ^ InputAdapter (41) - +- ^ BroadcastQueryStage (40), Statistics(X) - +- ReusedExchange (39) + +- BroadcastQueryStage (40), Statistics(X) + +- ReusedExchange (39) +- == Initial Plan == - Sort (92) - +- Exchange (91) - +- HashAggregate (90) - +- Exchange (89) - +- HashAggregate (88) - +- Project (87) - +- BroadcastHashJoin Inner BuildRight (86) - :- Project (82) - : +- BroadcastHashJoin Inner BuildRight (81) - : :- Project (77) - : : +- BroadcastHashJoin Inner BuildRight (76) - : : :- Project (72) - : : : +- BroadcastHashJoin Inner BuildRight (71) - : : : :- Project (67) - : : : : +- BroadcastHashJoin Inner BuildLeft (66) - : : : : :- BroadcastExchange (63) - : : : : : +- Filter (62) - : : : : : +- Scan parquet (61) - : : : : +- Filter (65) - : : : : +- Scan parquet (64) - : : : +- BroadcastExchange (70) - : : : +- Filter (69) - : : : +- Scan parquet (68) - : : +- BroadcastExchange (75) - : : +- Filter (74) - : : +- Scan parquet (73) - : +- BroadcastExchange (80) - : +- Filter (79) - : +- Scan parquet (78) - +- BroadcastExchange (85) - +- Filter (84) - +- Scan parquet (83) + Sort (94) + +- Exchange (93) + +- HashAggregate (92) + +- Exchange (91) + +- HashAggregate (90) + +- Project (89) + +- BroadcastHashJoin 
Inner BuildRight (88) + :- Project (84) + : +- BroadcastHashJoin Inner BuildRight (83) + : :- Project (79) + : : +- BroadcastHashJoin Inner BuildRight (78) + : : :- Project (74) + : : : +- BroadcastHashJoin Inner BuildRight (73) + : : : :- Project (69) + : : : : +- BroadcastHashJoin Inner BuildLeft (68) + : : : : :- BroadcastExchange (65) + : : : : : +- Filter (64) + : : : : : +- Scan parquet (63) + : : : : +- Filter (67) + : : : : +- Scan parquet (66) + : : : +- BroadcastExchange (72) + : : : +- Filter (71) + : : : +- Scan parquet (70) + : : +- BroadcastExchange (77) + : : +- Filter (76) + : : +- Scan parquet (75) + : +- BroadcastExchange (82) + : +- Filter (81) + : +- Scan parquet (80) + +- BroadcastExchange (87) + +- Filter (86) + +- Scan parquet (85) (1) Scan parquet @@ -289,213 +284,221 @@ Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: false -(48) ColumnarExchange +(48) VeloxAppendBatches +Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] +Arguments: X + +(49) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(49) ShuffleQueryStage +(50) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: X -(50) InputAdapter +(51) InputAdapter Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(51) InputIteratorTransformer +(52) InputIteratorTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(52) RegularHashAggregateExecTransformer +(53) RegularHashAggregateExecTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(53) WholeStageCodegenTransformer (X) +(54) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(54) ColumnarExchange +(55) VeloxAppendBatches +Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] +Arguments: X + +(56) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(55) ShuffleQueryStage +(57) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: X -(56) InputAdapter +(58) InputAdapter Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(57) InputIteratorTransformer +(59) InputIteratorTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(58) SortExecTransformer +(60) SortExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(59) WholeStageCodegenTransformer (X) +(61) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(60) VeloxColumnarToRowExec +(62) VeloxColumnarToRowExec Input [4]: [supp_nation#X, cust_nation#X, l_year#X, 
revenue#X] -(61) Scan parquet +(63) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(62) Filter +(64) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(63) BroadcastExchange +(65) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(64) Scan parquet +(66) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(65) Filter +(67) Filter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(66) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(67) Project +(69) Project Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(68) Scan parquet +(70) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(69) Filter +(71) Filter Input [2]: [o_orderkey#X, o_custkey#X] Condition : (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(70) BroadcastExchange +(72) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(71) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(72) Project +(74) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(73) Scan parquet +(75) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(74) Filter +(76) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(75) BroadcastExchange +(77) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(76) BroadcastHashJoin +(78) BroadcastHashJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(77) Project +(79) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(78) Scan parquet +(80) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(79) Filter +(81) Filter Input [2]: [n_nationkey#X, 
n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(80) BroadcastExchange +(82) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(81) BroadcastHashJoin +(83) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(82) Project +(84) Project Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(83) Scan parquet +(85) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,GERMANY),EqualTo(n_name,FRANCE))] ReadSchema: struct -(84) Filter +(86) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = GERMANY) OR (n_name#X = FRANCE))) -(85) BroadcastExchange +(87) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(87) Project +(89) Project Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(88) HashAggregate +(90) HashAggregate Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(89) Exchange +(91) Exchange Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) HashAggregate +(92) HashAggregate Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(91) Exchange +(93) Exchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(92) Sort +(94) Sort Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(93) AdaptiveSparkPlan +(95) AdaptiveSparkPlan Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/8.txt index 9b490b9ba436d..becfc12a1b866 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/8.txt @@ -1,125 +1,118 @@ == Physical Plan == -AdaptiveSparkPlan (129) +AdaptiveSparkPlan (131) +- == Final Plan == - VeloxColumnarToRowExec (84) - +- ^ SortExecTransformer (82) - +- ^ InputIteratorTransformer (81) - +- ^ InputAdapter (80) - +- ^ ShuffleQueryStage (79), Statistics(X) - +- ColumnarExchange (78) - +- ^ ProjectExecTransformer (76) - +- ^ RegularHashAggregateExecTransformer (75) - +- ^ InputIteratorTransformer (74) - +- ^ InputAdapter (73) - +- ^ ShuffleQueryStage (72), Statistics(X) - +- ColumnarExchange (71) + VeloxColumnarToRowExec (86) + +- ^ SortExecTransformer (84) + +- ^ InputIteratorTransformer (83) + +- ShuffleQueryStage (81), Statistics(X) + +- ColumnarExchange (80) + +- VeloxAppendBatches (79) + +- ^ ProjectExecTransformer (77) + +- ^ RegularHashAggregateExecTransformer (76) + +- ^ InputIteratorTransformer (75) + +- ShuffleQueryStage (73), Statistics(X) + +- ColumnarExchange (72) + +- VeloxAppendBatches (71) +- ^ ProjectExecTransformer (69) +- ^ FlushableHashAggregateExecTransformer (68) +- ^ ProjectExecTransformer (67) - +- ^ BroadcastHashJoinExecTransformer Inner (66) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (66) :- ^ ProjectExecTransformer (57) - : +- ^ BroadcastHashJoinExecTransformer Inner (56) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (56) : :- ^ ProjectExecTransformer (48) - : : +- ^ BroadcastHashJoinExecTransformer Inner (47) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) : : :- ^ ProjectExecTransformer (39) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : : : :- ^ ProjectExecTransformer (30) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : : : :- ^ ProjectExecTransformer (21) - : : : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : : : :- ^ ProjectExecTransformer (12) - : : : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : : : :- ^ InputIteratorTransformer (8) - : : : : : : : +- ^ InputAdapter (7) - : : : : : : : +- ^ BroadcastQueryStage (6), Statistics(X) - : : : : : : : +- ColumnarBroadcastExchange (5) - : : : : : : : +- ^ ProjectExecTransformer (3) - : : : : : : : +- ^ NoopFilter (2) - : : : : : : : +- ^ Scan parquet (1) + : : : : : : : +- BroadcastQueryStage (6), Statistics(X) + : : : : : : : +- ColumnarBroadcastExchange (5) + : : : : : : : +- ^ ProjectExecTransformer (3) + : : : : : : : +- ^ NoopFilter (2) + : : : : : : : +- ^ Scan parquet (1) : : : : : : +- ^ NoopFilter (10) : : : : : : +- ^ Scan parquet (9) : : : : : +- ^ InputIteratorTransformer (19) - : : : : : +- ^ InputAdapter (18) - : : : : : +- ^ BroadcastQueryStage (17), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (16) - : : : : : +- ^ NoopFilter (14) - : : : : : +- ^ Scan parquet (13) + : : : : : +- BroadcastQueryStage (17), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (16) + : : : : : +- ^ NoopFilter (14) + : : : : : +- ^ Scan parquet (13) : : : : +- ^ InputIteratorTransformer (28) - : : : : +- ^ InputAdapter (27) - : : : : +- ^ BroadcastQueryStage (26), Statistics(X) - : : : : +- ColumnarBroadcastExchange (25) - : : : : +- ^ NoopFilter 
(23) - : : : : +- ^ Scan parquet (22) + : : : : +- BroadcastQueryStage (26), Statistics(X) + : : : : +- ColumnarBroadcastExchange (25) + : : : : +- ^ NoopFilter (23) + : : : : +- ^ Scan parquet (22) : : : +- ^ InputIteratorTransformer (37) - : : : +- ^ InputAdapter (36) - : : : +- ^ BroadcastQueryStage (35), Statistics(X) - : : : +- ColumnarBroadcastExchange (34) - : : : +- ^ NoopFilter (32) - : : : +- ^ Scan parquet (31) + : : : +- BroadcastQueryStage (35), Statistics(X) + : : : +- ColumnarBroadcastExchange (34) + : : : +- ^ NoopFilter (32) + : : : +- ^ Scan parquet (31) : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ BroadcastQueryStage (44), Statistics(X) - : : +- ColumnarBroadcastExchange (43) - : : +- ^ NoopFilter (41) - : : +- ^ Scan parquet (40) + : : +- BroadcastQueryStage (44), Statistics(X) + : : +- ColumnarBroadcastExchange (43) + : : +- ^ NoopFilter (41) + : : +- ^ Scan parquet (40) : +- ^ InputIteratorTransformer (55) - : +- ^ InputAdapter (54) - : +- ^ BroadcastQueryStage (53), Statistics(X) - : +- ColumnarBroadcastExchange (52) - : +- ^ NoopFilter (50) - : +- ^ Scan parquet (49) + : +- BroadcastQueryStage (53), Statistics(X) + : +- ColumnarBroadcastExchange (52) + : +- ^ NoopFilter (50) + : +- ^ Scan parquet (49) +- ^ InputIteratorTransformer (65) - +- ^ InputAdapter (64) - +- ^ BroadcastQueryStage (63), Statistics(X) - +- ColumnarBroadcastExchange (62) - +- ^ ProjectExecTransformer (60) - +- ^ NoopFilter (59) - +- ^ Scan parquet (58) + +- BroadcastQueryStage (63), Statistics(X) + +- ColumnarBroadcastExchange (62) + +- ^ ProjectExecTransformer (60) + +- ^ NoopFilter (59) + +- ^ Scan parquet (58) +- == Initial Plan == - Sort (128) - +- Exchange (127) - +- HashAggregate (126) - +- Exchange (125) - +- HashAggregate (124) - +- Project (123) - +- BroadcastHashJoin Inner BuildRight (122) - :- Project (117) - : +- BroadcastHashJoin Inner BuildRight (116) - : :- Project (112) - : : +- BroadcastHashJoin Inner BuildRight (111) - : : :- Project (107) - : : : +- BroadcastHashJoin Inner BuildRight (106) - : : : :- Project (102) - : : : : +- BroadcastHashJoin Inner BuildRight (101) - : : : : :- Project (97) - : : : : : +- BroadcastHashJoin Inner BuildRight (96) - : : : : : :- Project (92) - : : : : : : +- BroadcastHashJoin Inner BuildLeft (91) - : : : : : : :- BroadcastExchange (88) - : : : : : : : +- Project (87) - : : : : : : : +- Filter (86) - : : : : : : : +- Scan parquet (85) - : : : : : : +- Filter (90) - : : : : : : +- Scan parquet (89) - : : : : : +- BroadcastExchange (95) - : : : : : +- Filter (94) - : : : : : +- Scan parquet (93) - : : : : +- BroadcastExchange (100) - : : : : +- Filter (99) - : : : : +- Scan parquet (98) - : : : +- BroadcastExchange (105) - : : : +- Filter (104) - : : : +- Scan parquet (103) - : : +- BroadcastExchange (110) - : : +- Filter (109) - : : +- Scan parquet (108) - : +- BroadcastExchange (115) - : +- Filter (114) - : +- Scan parquet (113) - +- BroadcastExchange (121) - +- Project (120) - +- Filter (119) - +- Scan parquet (118) + Sort (130) + +- Exchange (129) + +- HashAggregate (128) + +- Exchange (127) + +- HashAggregate (126) + +- Project (125) + +- BroadcastHashJoin Inner BuildRight (124) + :- Project (119) + : +- BroadcastHashJoin Inner BuildRight (118) + : :- Project (114) + : : +- BroadcastHashJoin Inner BuildRight (113) + : : :- Project (109) + : : : +- BroadcastHashJoin Inner BuildRight (108) + : : : :- Project (104) + : : : : +- BroadcastHashJoin Inner BuildRight (103) + : : : : :- Project (99) + : : : : : +- 
BroadcastHashJoin Inner BuildRight (98) + : : : : : :- Project (94) + : : : : : : +- BroadcastHashJoin Inner BuildLeft (93) + : : : : : : :- BroadcastExchange (90) + : : : : : : : +- Project (89) + : : : : : : : +- Filter (88) + : : : : : : : +- Scan parquet (87) + : : : : : : +- Filter (92) + : : : : : : +- Scan parquet (91) + : : : : : +- BroadcastExchange (97) + : : : : : +- Filter (96) + : : : : : +- Scan parquet (95) + : : : : +- BroadcastExchange (102) + : : : : +- Filter (101) + : : : : +- Scan parquet (100) + : : : +- BroadcastExchange (107) + : : : +- Filter (106) + : : : +- Scan parquet (105) + : : +- BroadcastExchange (112) + : : +- Filter (111) + : : +- Scan parquet (110) + : +- BroadcastExchange (117) + : +- Filter (116) + : +- Scan parquet (115) + +- BroadcastExchange (123) + +- Project (122) + +- Filter (121) + +- Scan parquet (120) (1) Scan parquet @@ -422,273 +415,281 @@ Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(71) ColumnarExchange +(71) VeloxAppendBatches +Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(72) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(72) ShuffleQueryStage +(73) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(73) InputAdapter +(74) InputAdapter Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(74) InputIteratorTransformer +(75) InputIteratorTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(75) RegularHashAggregateExecTransformer +(76) RegularHashAggregateExecTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(76) ProjectExecTransformer +(77) ProjectExecTransformer Output [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6)) AS mkt_share#X] Input [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(77) WholeStageCodegenTransformer (X) +(78) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(78) ColumnarExchange +(79) VeloxAppendBatches +Input [2]: [o_year#X, mkt_share#X] +Arguments: X + +(80) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(79) ShuffleQueryStage +(81) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] Arguments: X -(80) InputAdapter +(82) InputAdapter Input [2]: [o_year#X, mkt_share#X] -(81) InputIteratorTransformer +(83) InputIteratorTransformer Input [2]: [o_year#X, mkt_share#X] -(82) SortExecTransformer +(84) SortExecTransformer Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(83) WholeStageCodegenTransformer (X) +(85) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(84) 
VeloxColumnarToRowExec +(86) VeloxColumnarToRowExec Input [2]: [o_year#X, mkt_share#X] -(85) Scan parquet +(87) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_type), EqualTo(p_type,ECONOMY ANODIZED STEEL), IsNotNull(p_partkey)] ReadSchema: struct -(86) Filter +(88) Filter Input [2]: [p_partkey#X, p_type#X] Condition : ((isnotnull(p_type#X) AND (p_type#X = ECONOMY ANODIZED STEEL)) AND isnotnull(p_partkey#X)) -(87) Project +(89) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_type#X] -(88) BroadcastExchange +(90) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(89) Scan parquet +(91) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(90) Filter +(92) Filter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(91) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(92) Project +(94) Project Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(93) Scan parquet +(95) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(94) Filter +(96) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(95) BroadcastExchange +(97) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(96) BroadcastHashJoin +(98) BroadcastHashJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(97) Project +(99) Project Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(98) Scan parquet +(100) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(99) Filter +(101) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(100) BroadcastExchange +(102) BroadcastExchange Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(101) BroadcastHashJoin +(103) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(102) Project +(104) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(103) Scan parquet +(105) 
Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(104) Filter +(106) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(105) BroadcastExchange +(107) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(106) BroadcastHashJoin +(108) BroadcastHashJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(107) Project +(109) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(108) Scan parquet +(110) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(109) Filter +(111) Filter Input [2]: [n_nationkey#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(110) BroadcastExchange +(112) BroadcastExchange Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(111) BroadcastHashJoin +(113) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(112) Project +(114) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(113) Scan parquet +(115) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(114) Filter +(116) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(115) BroadcastExchange +(117) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(116) BroadcastHashJoin +(118) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(117) Project +(119) Project Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(118) Scan parquet +(120) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(119) Filter +(121) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(120) Project +(122) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(121) BroadcastExchange +(123) BroadcastExchange Input [1]: [r_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(122) BroadcastHashJoin +(124) BroadcastHashJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(123) Project +(125) Project Output [3]: [year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * 
promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS volume#X, n_name#X AS nation#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(124) HashAggregate +(126) HashAggregate Input [3]: [o_year#X, volume#X, nation#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(125) Exchange +(127) Exchange Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(126) HashAggregate +(128) HashAggregate Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6)) AS mkt_share#X] -(127) Exchange +(129) Exchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(128) Sort +(130) Sort Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(129) AdaptiveSparkPlan +(131) AdaptiveSparkPlan Output [2]: [o_year#X, mkt_share#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/9.txt index 8523cbeff2abb..a486fc65677b7 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/9.txt @@ -1,96 +1,91 @@ == Physical Plan == -AdaptiveSparkPlan (98) +AdaptiveSparkPlan (100) +- == Final Plan == - VeloxColumnarToRowExec (64) - +- ^ SortExecTransformer (62) - +- ^ InputIteratorTransformer (61) - +- ^ InputAdapter (60) - +- ^ ShuffleQueryStage (59), Statistics(X) - +- ColumnarExchange (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) + VeloxColumnarToRowExec (66) + +- ^ SortExecTransformer (64) + +- ^ InputIteratorTransformer (63) + +- ShuffleQueryStage (61), Statistics(X) + +- ColumnarExchange (60) + +- VeloxAppendBatches (59) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ InputIteratorTransformer (56) + +- ShuffleQueryStage (54), Statistics(X) + +- ColumnarExchange (53) + +- VeloxAppendBatches (52) +- ^ ProjectExecTransformer (50) +- ^ FlushableHashAggregateExecTransformer (49) +- ^ ProjectExecTransformer (48) - +- ^ BroadcastHashJoinExecTransformer Inner (47) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) :- ^ ProjectExecTransformer (39) - : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : :- ^ ProjectExecTransformer (30) - : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : :- ^ ProjectExecTransformer (21) - : : : +- ^ 
BroadcastHashJoinExecTransformer Inner (20) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : :- ^ ProjectExecTransformer (12) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ BroadcastQueryStage (6), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (5) - : : : : : +- ^ ProjectExecTransformer (3) - : : : : : +- ^ NoopFilter (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (6), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (5) + : : : : : +- ^ ProjectExecTransformer (3) + : : : : : +- ^ NoopFilter (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ NoopFilter (10) : : : : +- ^ Scan parquet (9) : : : +- ^ InputIteratorTransformer (19) - : : : +- ^ InputAdapter (18) - : : : +- ^ BroadcastQueryStage (17), Statistics(X) - : : : +- ColumnarBroadcastExchange (16) - : : : +- ^ NoopFilter (14) - : : : +- ^ Scan parquet (13) + : : : +- BroadcastQueryStage (17), Statistics(X) + : : : +- ColumnarBroadcastExchange (16) + : : : +- ^ NoopFilter (14) + : : : +- ^ Scan parquet (13) : : +- ^ InputIteratorTransformer (28) - : : +- ^ InputAdapter (27) - : : +- ^ BroadcastQueryStage (26), Statistics(X) - : : +- ColumnarBroadcastExchange (25) - : : +- ^ NoopFilter (23) - : : +- ^ Scan parquet (22) + : : +- BroadcastQueryStage (26), Statistics(X) + : : +- ColumnarBroadcastExchange (25) + : : +- ^ NoopFilter (23) + : : +- ^ Scan parquet (22) : +- ^ InputIteratorTransformer (37) - : +- ^ InputAdapter (36) - : +- ^ BroadcastQueryStage (35), Statistics(X) - : +- ColumnarBroadcastExchange (34) - : +- ^ NoopFilter (32) - : +- ^ Scan parquet (31) + : +- BroadcastQueryStage (35), Statistics(X) + : +- ColumnarBroadcastExchange (34) + : +- ^ NoopFilter (32) + : +- ^ Scan parquet (31) +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ BroadcastQueryStage (44), Statistics(X) - +- ColumnarBroadcastExchange (43) - +- ^ NoopFilter (41) - +- ^ Scan parquet (40) + +- BroadcastQueryStage (44), Statistics(X) + +- ColumnarBroadcastExchange (43) + +- ^ NoopFilter (41) + +- ^ Scan parquet (40) +- == Initial Plan == - Sort (97) - +- Exchange (96) - +- HashAggregate (95) - +- Exchange (94) - +- HashAggregate (93) - +- Project (92) - +- BroadcastHashJoin Inner BuildRight (91) - :- Project (87) - : +- BroadcastHashJoin Inner BuildRight (86) - : :- Project (82) - : : +- BroadcastHashJoin Inner BuildRight (81) - : : :- Project (77) - : : : +- BroadcastHashJoin Inner BuildRight (76) - : : : :- Project (72) - : : : : +- BroadcastHashJoin Inner BuildLeft (71) - : : : : :- BroadcastExchange (68) - : : : : : +- Project (67) - : : : : : +- Filter (66) - : : : : : +- Scan parquet (65) - : : : : +- Filter (70) - : : : : +- Scan parquet (69) - : : : +- BroadcastExchange (75) - : : : +- Filter (74) - : : : +- Scan parquet (73) - : : +- BroadcastExchange (80) - : : +- Filter (79) - : : +- Scan parquet (78) - : +- BroadcastExchange (85) - : +- Filter (84) - : +- Scan parquet (83) - +- BroadcastExchange (90) - +- Filter (89) - +- Scan parquet (88) + Sort (99) + +- Exchange (98) + +- HashAggregate (97) + +- Exchange (96) + +- HashAggregate (95) + +- Project (94) + +- BroadcastHashJoin Inner BuildRight (93) + :- Project (89) + : +- BroadcastHashJoin Inner BuildRight (88) + : :- Project (84) + : : +- BroadcastHashJoin Inner BuildRight (83) + : : :- Project (79) + : : : +- BroadcastHashJoin Inner BuildRight 
(78) + : : : :- Project (74) + : : : : +- BroadcastHashJoin Inner BuildLeft (73) + : : : : :- BroadcastExchange (70) + : : : : : +- Project (69) + : : : : : +- Filter (68) + : : : : : +- Scan parquet (67) + : : : : +- Filter (72) + : : : : +- Scan parquet (71) + : : : +- BroadcastExchange (77) + : : : +- Filter (76) + : : : +- Scan parquet (75) + : : +- BroadcastExchange (82) + : : +- Filter (81) + : : +- Scan parquet (80) + : +- BroadcastExchange (87) + : +- Filter (86) + : +- Scan parquet (85) + +- BroadcastExchange (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet @@ -313,217 +308,225 @@ Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: false -(52) ColumnarExchange +(52) VeloxAppendBatches +Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] +Arguments: X + +(53) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(54) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: X -(54) InputAdapter +(55) InputAdapter Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(55) InputIteratorTransformer +(56) InputIteratorTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(56) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(57) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(58) ColumnarExchange +(59) VeloxAppendBatches +Input [3]: [nation#X, o_year#X, sum_profit#X] +Arguments: X + +(60) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(61) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: X -(60) InputAdapter +(62) InputAdapter Input [3]: [nation#X, o_year#X, sum_profit#X] -(61) InputIteratorTransformer +(63) InputIteratorTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] -(62) SortExecTransformer +(64) SortExecTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(63) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(64) VeloxColumnarToRowExec +(66) VeloxColumnarToRowExec Input [3]: [nation#X, o_year#X, sum_profit#X] -(65) Scan parquet +(67) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringContains(p_name,green), IsNotNull(p_partkey)] ReadSchema: struct -(66) Filter +(68) Filter Input [2]: [p_partkey#X, p_name#X] Condition : ((isnotnull(p_name#X) AND Contains(p_name#X, green)) AND isnotnull(p_partkey#X)) -(67) Project +(69) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(68) BroadcastExchange +(70) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(69) Scan parquet +(71) 
Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(71) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(72) Project +(74) Project Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(73) Scan parquet +(75) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(74) Filter +(76) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(75) BroadcastExchange +(77) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(76) BroadcastHashJoin +(78) BroadcastHashJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(77) Project +(79) Project Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(78) Scan parquet +(80) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(79) Filter +(81) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Condition : (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(80) BroadcastExchange +(82) BroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, false], input[0, bigint, false]),false), [plan_id=X] -(81) BroadcastHashJoin +(83) BroadcastHashJoin Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join condition: None -(82) Project +(84) Project Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(83) Scan parquet +(85) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(84) Filter +(86) Filter Input [2]: [o_orderkey#X, o_orderdate#X] Condition : isnotnull(o_orderkey#X) -(85) BroadcastExchange +(87) BroadcastExchange Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(87) Project +(89) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: 
[l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(88) Scan parquet +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(89) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(90) BroadcastExchange +(92) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(91) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(92) Project +(94) Project Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) as decimal(27,4))) - promote_precision(cast(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(l_quantity#X)), DecimalType(25,4)) as decimal(27,4)))), DecimalType(27,4)) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(93) HashAggregate +(95) HashAggregate Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(94) Exchange +(96) Exchange Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(95) HashAggregate +(97) HashAggregate Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(96) Exchange +(98) Exchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) Sort +(99) Sort Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(98) AdaptiveSparkPlan +(100) AdaptiveSparkPlan Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/1.txt index 41613c36f7bd8..545f2e7e086df 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/1.txt @@ -1,31 +1,31 @@ == Physical Plan == -AdaptiveSparkPlan (28) +AdaptiveSparkPlan (30) +- == Final Plan == - VeloxColumnarToRowExec (19) - +- ^ SortExecTransformer (17) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8), Statistics(X) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (21) + +- ^ SortExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), 
Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9), Statistics(X) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ ProjectExecTransformer (5) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ NoopFilter (2) +- ^ Scan parquet (1) +- == Initial Plan == - Sort (27) - +- Exchange (26) - +- HashAggregate (25) - +- Exchange (24) - +- HashAggregate (23) - +- Project (22) - +- Filter (21) - +- Scan parquet (20) + Sort (29) + +- Exchange (28) + +- HashAggregate (27) + +- Exchange (26) + +- HashAggregate (25) + +- Project (24) + +- Filter (23) + +- Scan parquet (22) (1) Scan parquet @@ -58,97 +58,105 @@ Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] +Arguments: X + +(8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum((l_extendedprice#X * (1 - l_discount#X))), sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS sum_disc_price#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * 
(1 + l_tax#X)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] +Arguments: X + +(15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(17) SortExecTransformer +(19) SortExecTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(18) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(19) VeloxColumnarToRowExec +(21) VeloxColumnarToRowExec Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(20) Scan parquet +(22) Scan parquet Output [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), LessThanOrEqual(l_shipdate,1998-09-02)] ReadSchema: struct -(21) Filter +(23) Filter Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Condition : (isnotnull(l_shipdate#X) AND (l_shipdate#X <= 1998-09-02)) -(22) Project +(24) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] -(23) HashAggregate +(25) HashAggregate Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [partial_sum(l_quantity#X), partial_sum(l_extendedprice#X), partial_sum((l_extendedprice#X * (1 - l_discount#X))), partial_sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X))), partial_avg(l_quantity#X), 
partial_avg(l_extendedprice#X), partial_avg(l_discount#X), partial_count(1)] Aggregate Attributes [15]: [sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Results [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(24) Exchange +(26) Exchange Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(25) HashAggregate +(27) HashAggregate Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum((l_extendedprice#X * (1 - l_discount#X))), sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS sum_disc_price#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(26) Exchange +(28) Exchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Sort +(29) Sort Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(28) AdaptiveSparkPlan +(30) AdaptiveSparkPlan Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/10.txt index 276a885015d0a..9325f007789b8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/10.txt @@ -1,68 +1,65 @@ == Physical Plan == -AdaptiveSparkPlan (67) +AdaptiveSparkPlan (68) +- == Final Plan == - VeloxColumnarToRowExec (43) - +- TakeOrderedAndProjectExecTransformer (42) - +- ^ ProjectExecTransformer (40) - +- ^ RegularHashAggregateExecTransformer (39) - +- ^ InputIteratorTransformer (38) - +- ^ InputAdapter (37) - +- ^ ShuffleQueryStage (36), Statistics(X) - +- ColumnarExchange (35) + VeloxColumnarToRowExec 
(44) + +- TakeOrderedAndProjectExecTransformer (43) + +- ^ ProjectExecTransformer (41) + +- ^ RegularHashAggregateExecTransformer (40) + +- ^ InputIteratorTransformer (39) + +- ShuffleQueryStage (37), Statistics(X) + +- ColumnarExchange (36) + +- VeloxAppendBatches (35) +- ^ ProjectExecTransformer (33) +- ^ FlushableHashAggregateExecTransformer (32) +- ^ ProjectExecTransformer (31) - +- ^ BroadcastHashJoinExecTransformer Inner (30) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (30) :- ^ ProjectExecTransformer (22) - : +- ^ BroadcastHashJoinExecTransformer Inner (21) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (21) : :- ^ ProjectExecTransformer (12) - : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) : : :- ^ NoopFilter (2) : : : +- ^ Scan parquet (1) : : +- ^ InputIteratorTransformer (10) - : : +- ^ InputAdapter (9) - : : +- ^ BroadcastQueryStage (8), Statistics(X) - : : +- ColumnarBroadcastExchange (7) - : : +- ^ ProjectExecTransformer (5) - : : +- ^ NoopFilter (4) - : : +- ^ Scan parquet (3) + : : +- BroadcastQueryStage (8), Statistics(X) + : : +- ColumnarBroadcastExchange (7) + : : +- ^ ProjectExecTransformer (5) + : : +- ^ NoopFilter (4) + : : +- ^ Scan parquet (3) : +- ^ InputIteratorTransformer (20) - : +- ^ InputAdapter (19) - : +- ^ BroadcastQueryStage (18), Statistics(X) - : +- ColumnarBroadcastExchange (17) - : +- ^ ProjectExecTransformer (15) - : +- ^ NoopFilter (14) - : +- ^ Scan parquet (13) + : +- BroadcastQueryStage (18), Statistics(X) + : +- ColumnarBroadcastExchange (17) + : +- ^ ProjectExecTransformer (15) + : +- ^ NoopFilter (14) + : +- ^ Scan parquet (13) +- ^ InputIteratorTransformer (29) - +- ^ InputAdapter (28) - +- ^ BroadcastQueryStage (27), Statistics(X) - +- ColumnarBroadcastExchange (26) - +- ^ NoopFilter (24) - +- ^ Scan parquet (23) + +- BroadcastQueryStage (27), Statistics(X) + +- ColumnarBroadcastExchange (26) + +- ^ NoopFilter (24) + +- ^ Scan parquet (23) +- == Initial Plan == - TakeOrderedAndProject (66) - +- HashAggregate (65) - +- Exchange (64) - +- HashAggregate (63) - +- Project (62) - +- BroadcastHashJoin Inner BuildRight (61) - :- Project (57) - : +- BroadcastHashJoin Inner BuildRight (56) - : :- Project (51) - : : +- BroadcastHashJoin Inner BuildRight (50) - : : :- Filter (45) - : : : +- Scan parquet (44) - : : +- BroadcastExchange (49) - : : +- Project (48) - : : +- Filter (47) - : : +- Scan parquet (46) - : +- BroadcastExchange (55) - : +- Project (54) - : +- Filter (53) - : +- Scan parquet (52) - +- BroadcastExchange (60) - +- Filter (59) - +- Scan parquet (58) + TakeOrderedAndProject (67) + +- HashAggregate (66) + +- Exchange (65) + +- HashAggregate (64) + +- Project (63) + +- BroadcastHashJoin Inner BuildRight (62) + :- Project (58) + : +- BroadcastHashJoin Inner BuildRight (57) + : :- Project (52) + : : +- BroadcastHashJoin Inner BuildRight (51) + : : :- Filter (46) + : : : +- Scan parquet (45) + : : +- BroadcastExchange (50) + : : +- Project (49) + : : +- Filter (48) + : : +- Scan parquet (47) + : +- BroadcastExchange (56) + : +- Project (55) + : +- Filter (54) + : +- Scan parquet (53) + +- BroadcastExchange (61) + +- Filter (60) + +- Scan parquet (59) (1) Scan parquet @@ -216,158 +213,162 @@ Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: false -(35) ColumnarExchange 
+(35) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] +Arguments: X + +(36) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(37) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: X -(37) InputAdapter +(38) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(38) InputIteratorTransformer +(39) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(39) RegularHashAggregateExecTransformer +(40) RegularHashAggregateExecTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(40) ProjectExecTransformer +(41) ProjectExecTransformer Output [8]: [c_custkey#X, c_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Input [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(41) WholeStageCodegenTransformer (X) +(42) WholeStageCodegenTransformer (X) Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: false -(42) TakeOrderedAndProjectExecTransformer +(43) TakeOrderedAndProjectExecTransformer Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X], 0 -(43) VeloxColumnarToRowExec +(44) VeloxColumnarToRowExec Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(44) Scan parquet +(45) Scan parquet Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(45) Filter +(46) Filter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(46) Scan parquet +(47) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(47) Filter +(48) Filter Input [3]: 
[o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(48) Project +(49) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(49) BroadcastExchange +(50) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(50) BroadcastHashJoin +(51) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(51) Project +(52) Project Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(52) Scan parquet +(53) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(53) Filter +(54) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Condition : ((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(54) Project +(55) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(55) BroadcastExchange +(56) BroadcastExchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(56) BroadcastHashJoin +(57) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(57) Project +(58) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(58) Scan parquet +(59) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(59) Filter +(60) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(60) BroadcastExchange +(61) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(61) BroadcastHashJoin +(62) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(62) Project +(63) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(63) HashAggregate +(64) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, 
c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(64) Exchange +(65) Exchange Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(65) HashAggregate +(66) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [8]: [c_custkey#X, c_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(66) TakeOrderedAndProject +(67) TakeOrderedAndProject Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(67) AdaptiveSparkPlan +(68) AdaptiveSparkPlan Output [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/11.txt index f8d59a67a22b0..c593374b9a001 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/11.txt @@ -1,59 +1,57 @@ == Physical Plan == -AdaptiveSparkPlan (58) +AdaptiveSparkPlan (60) +- == Final Plan == - VeloxColumnarToRowExec (38) - +- ^ SortExecTransformer (36) - +- ^ InputIteratorTransformer (35) - +- ^ InputAdapter (34) - +- ^ ShuffleQueryStage (33), Statistics(X) - +- ColumnarExchange (32) - +- ^ FilterExecTransformer (30) - +- ^ RegularHashAggregateExecTransformer (29) - +- ^ InputIteratorTransformer (28) - +- ^ InputAdapter (27) - +- ^ ShuffleQueryStage (26), Statistics(X) - +- ColumnarExchange (25) + VeloxColumnarToRowExec (40) + +- ^ SortExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35), Statistics(X) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ FilterExecTransformer (31) + +- ^ RegularHashAggregateExecTransformer (30) + +- ^ InputIteratorTransformer (29) + +- ShuffleQueryStage (27), Statistics(X) + +- ColumnarExchange (26) + +- VeloxAppendBatches (25) +- ^ ProjectExecTransformer (23) +- ^ FlushableHashAggregateExecTransformer (22) +- ^ ProjectExecTransformer (21) - +- ^ BroadcastHashJoinExecTransformer Inner (20) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) :- ^ ProjectExecTransformer (11) - : +- ^ BroadcastHashJoinExecTransformer Inner (10) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (10) : :- ^ NoopFilter (2) : : +- ^ Scan parquet (1) : +- ^ InputIteratorTransformer (9) - : +- ^ InputAdapter (8) - : +- ^ BroadcastQueryStage (7), Statistics(X) - : +- ColumnarBroadcastExchange (6) - : +- ^ NoopFilter (4) - : +- ^ Scan parquet (3) + : +- BroadcastQueryStage (7), Statistics(X) + : +- ColumnarBroadcastExchange (6) + : +- ^ NoopFilter (4) + : +- ^ Scan parquet (3) +- ^ InputIteratorTransformer (19) - +- ^ InputAdapter 
(18) - +- ^ BroadcastQueryStage (17), Statistics(X) - +- ColumnarBroadcastExchange (16) - +- ^ ProjectExecTransformer (14) - +- ^ NoopFilter (13) - +- ^ Scan parquet (12) + +- BroadcastQueryStage (17), Statistics(X) + +- ColumnarBroadcastExchange (16) + +- ^ ProjectExecTransformer (14) + +- ^ NoopFilter (13) + +- ^ Scan parquet (12) +- == Initial Plan == - Sort (57) - +- Exchange (56) - +- Filter (55) - +- HashAggregate (54) - +- Exchange (53) - +- HashAggregate (52) - +- Project (51) - +- BroadcastHashJoin Inner BuildRight (50) - :- Project (45) - : +- BroadcastHashJoin Inner BuildRight (44) - : :- Filter (40) - : : +- Scan parquet (39) - : +- BroadcastExchange (43) - : +- Filter (42) - : +- Scan parquet (41) - +- BroadcastExchange (49) - +- Project (48) - +- Filter (47) - +- Scan parquet (46) + Sort (59) + +- Exchange (58) + +- Filter (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- Project (53) + +- BroadcastHashJoin Inner BuildRight (52) + :- Project (47) + : +- BroadcastHashJoin Inner BuildRight (46) + : :- Filter (42) + : : +- Scan parquet (41) + : +- BroadcastExchange (45) + : +- Filter (44) + : +- Scan parquet (43) + +- BroadcastExchange (51) + +- Project (50) + +- Filter (49) + +- Scan parquet (48) (1) Scan parquet @@ -164,388 +162,398 @@ Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: false -(25) ColumnarExchange +(25) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] +Arguments: X + +(26) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(26) ShuffleQueryStage +(27) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: X -(27) InputAdapter +(28) InputAdapter Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(28) InputIteratorTransformer +(29) InputIteratorTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(29) RegularHashAggregateExecTransformer +(30) RegularHashAggregateExecTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [2]: [ps_partkey#X, sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X AS value#X] -(30) FilterExecTransformer +(31) FilterExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(31) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(32) ColumnarExchange +(33) VeloxAppendBatches +Input [2]: [ps_partkey#X, value#X] +Arguments: X + +(34) ColumnarExchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(33) ShuffleQueryStage +(35) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] Arguments: X -(34) InputAdapter +(36) InputAdapter Input [2]: [ps_partkey#X, value#X] -(35) InputIteratorTransformer +(37) InputIteratorTransformer Input [2]: [ps_partkey#X, value#X] -(36) SortExecTransformer +(38) SortExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(37) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [2]: 
[ps_partkey#X, value#X] Arguments: false -(38) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [2]: [ps_partkey#X, value#X] -(39) Scan parquet +(41) Scan parquet Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(40) Filter +(42) Filter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(41) Scan parquet +(43) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(42) Filter +(44) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(43) BroadcastExchange +(45) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(44) BroadcastHashJoin +(46) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(45) Project +(47) Project Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(46) Scan parquet +(48) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(47) Filter +(49) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(48) Project +(50) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(49) BroadcastExchange +(51) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(50) BroadcastHashJoin +(52) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(51) Project +(53) Project Output [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(52) HashAggregate +(54) HashAggregate Input [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(53) Exchange +(55) Exchange Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) HashAggregate +(56) HashAggregate Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [2]: [ps_partkey#X, sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X AS value#X] -(55) Filter +(57) Filter Input [2]: [ps_partkey#X, value#X] Condition : (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(56) Exchange +(58) Exchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(57) Sort +(59) Sort Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], 
true, 0 -(58) AdaptiveSparkPlan +(60) AdaptiveSparkPlan Output [2]: [ps_partkey#X, value#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 30 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (99) +Subquery:1 Hosting operator id = 31 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (102) +- == Final Plan == - VeloxColumnarToRowExec (82) - +- ^ ProjectExecTransformer (80) - +- ^ RegularHashAggregateExecTransformer (79) - +- ^ InputIteratorTransformer (78) - +- ^ InputAdapter (77) - +- ^ ShuffleQueryStage (76), Statistics(X) - +- ColumnarExchange (75) - +- ^ FlushableHashAggregateExecTransformer (73) - +- ^ ProjectExecTransformer (72) - +- ^ BroadcastHashJoinExecTransformer Inner (71) - :- ^ ProjectExecTransformer (66) - : +- ^ BroadcastHashJoinExecTransformer Inner (65) - : :- ^ NoopFilter (60) - : : +- ^ Scan parquet (59) - : +- ^ InputIteratorTransformer (64) - : +- ^ InputAdapter (63) - : +- ^ BroadcastQueryStage (62), Statistics(X) - : +- ReusedExchange (61) - +- ^ InputIteratorTransformer (70) - +- ^ InputAdapter (69) - +- ^ BroadcastQueryStage (68), Statistics(X) - +- ReusedExchange (67) + VeloxColumnarToRowExec (85) + +- ^ ProjectExecTransformer (83) + +- ^ RegularHashAggregateExecTransformer (82) + +- ^ InputIteratorTransformer (81) + +- ShuffleQueryStage (79), Statistics(X) + +- ColumnarExchange (78) + +- VeloxAppendBatches (77) + +- ^ FlushableHashAggregateExecTransformer (75) + +- ^ ProjectExecTransformer (74) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (73) + :- ^ ProjectExecTransformer (68) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (67) + : :- ^ NoopFilter (62) + : : +- ^ Scan parquet (61) + : +- ^ InputIteratorTransformer (66) + : +- BroadcastQueryStage (64), Statistics(X) + : +- ReusedExchange (63) + +- ^ InputIteratorTransformer (72) + +- BroadcastQueryStage (70), Statistics(X) + +- ReusedExchange (69) +- == Initial Plan == - HashAggregate (98) - +- Exchange (97) - +- HashAggregate (96) - +- Project (95) - +- BroadcastHashJoin Inner BuildRight (94) - :- Project (89) - : +- BroadcastHashJoin Inner BuildRight (88) - : :- Filter (84) - : : +- Scan parquet (83) - : +- BroadcastExchange (87) - : +- Filter (86) - : +- Scan parquet (85) - +- BroadcastExchange (93) - +- Project (92) - +- Filter (91) - +- Scan parquet (90) - - -(59) Scan parquet + HashAggregate (101) + +- Exchange (100) + +- HashAggregate (99) + +- Project (98) + +- BroadcastHashJoin Inner BuildRight (97) + :- Project (92) + : +- BroadcastHashJoin Inner BuildRight (91) + : :- Filter (87) + : : +- Scan parquet (86) + : +- BroadcastExchange (90) + : +- Filter (89) + : +- Scan parquet (88) + +- BroadcastExchange (96) + +- Project (95) + +- Filter (94) + +- Scan parquet (93) + + +(61) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(60) NoopFilter +(62) NoopFilter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(61) ReusedExchange [Reuses operator id: 6] +(63) ReusedExchange [Reuses operator id: 6] Output [2]: [s_suppkey#X, s_nationkey#X] -(62) BroadcastQueryStage +(64) BroadcastQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(63) InputAdapter +(65) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(64) InputIteratorTransformer +(66) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] 
-(65) BroadcastHashJoinExecTransformer +(67) BroadcastHashJoinExecTransformer Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(66) ProjectExecTransformer +(68) ProjectExecTransformer Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(67) ReusedExchange [Reuses operator id: 16] +(69) ReusedExchange [Reuses operator id: 16] Output [1]: [n_nationkey#X] -(68) BroadcastQueryStage +(70) BroadcastQueryStage Output [1]: [n_nationkey#X] Arguments: X -(69) InputAdapter +(71) InputAdapter Input [1]: [n_nationkey#X] -(70) InputIteratorTransformer +(72) InputIteratorTransformer Input [1]: [n_nationkey#X] -(71) BroadcastHashJoinExecTransformer +(73) BroadcastHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(72) ProjectExecTransformer +(74) ProjectExecTransformer Output [3]: [ps_availqty#X, ps_supplycost#X, (ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))) AS _pre_X#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(73) FlushableHashAggregateExecTransformer +(75) FlushableHashAggregateExecTransformer Input [3]: [ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(74) WholeStageCodegenTransformer (X) +(76) WholeStageCodegenTransformer (X) Input [2]: [sum#X, isEmpty#X] Arguments: false -(75) ColumnarExchange +(77) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(78) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(76) ShuffleQueryStage +(79) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(77) InputAdapter +(80) InputAdapter Input [2]: [sum#X, isEmpty#X] -(78) InputIteratorTransformer +(81) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(79) RegularHashAggregateExecTransformer +(82) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] -(80) ProjectExecTransformer +(83) ProjectExecTransformer Output [1]: [(sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X * 0.0001000000) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Input [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] -(81) WholeStageCodegenTransformer (X) +(84) WholeStageCodegenTransformer (X) Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: false -(82) VeloxColumnarToRowExec +(85) VeloxColumnarToRowExec Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(83) Scan parquet +(86) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(84) Filter +(87) Filter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(85) Scan parquet +(88) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(86) Filter +(89) Filter Input [2]: 
[s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(87) BroadcastExchange +(90) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(88) BroadcastHashJoin +(91) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(89) Project +(92) Project Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(90) Scan parquet +(93) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(91) Filter +(94) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(92) Project +(95) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(93) BroadcastExchange +(96) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(94) BroadcastHashJoin +(97) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(95) Project +(98) Project Output [2]: [ps_availqty#X, ps_supplycost#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(96) HashAggregate +(99) HashAggregate Input [2]: [ps_availqty#X, ps_supplycost#X] Keys: [] Functions [1]: [partial_sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(97) Exchange +(100) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(98) HashAggregate +(101) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [1]: [(sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X * 0.0001000000) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(99) AdaptiveSparkPlan +(102) AdaptiveSparkPlan Output [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/12.txt index 9e142ea961f5e..93f3a4cf9ab77 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/12.txt @@ -1,44 +1,43 @@ == Physical Plan == -AdaptiveSparkPlan (42) +AdaptiveSparkPlan (44) +- == Final Plan == - VeloxColumnarToRowExec (28) - +- ^ SortExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ InputIteratorTransformer (19) - +- ^ InputAdapter (18) - +- ^ ShuffleQueryStage (17), Statistics(X) - +- ColumnarExchange (16) + VeloxColumnarToRowExec (30) + +- ^ SortExecTransformer (28) + +- ^ InputIteratorTransformer (27) + +- ShuffleQueryStage (25), Statistics(X) + +- ColumnarExchange (24) + +- VeloxAppendBatches (23) + +- ^ 
RegularHashAggregateExecTransformer (21) + +- ^ InputIteratorTransformer (20) + +- ShuffleQueryStage (18), Statistics(X) + +- ColumnarExchange (17) + +- VeloxAppendBatches (16) +- ^ ProjectExecTransformer (14) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ BroadcastQueryStage (5), Statistics(X) - : +- ColumnarBroadcastExchange (4) - : +- ^ NoopFilter (2) - : +- ^ Scan parquet (1) + : +- BroadcastQueryStage (5), Statistics(X) + : +- ColumnarBroadcastExchange (4) + : +- ^ NoopFilter (2) + : +- ^ Scan parquet (1) +- ^ ProjectExecTransformer (10) +- ^ NoopFilter (9) +- ^ Scan parquet (8) +- == Initial Plan == - Sort (41) - +- Exchange (40) - +- HashAggregate (39) - +- Exchange (38) - +- HashAggregate (37) - +- Project (36) - +- BroadcastHashJoin Inner BuildLeft (35) - :- BroadcastExchange (31) - : +- Filter (30) - : +- Scan parquet (29) - +- Project (34) - +- Filter (33) - +- Scan parquet (32) + Sort (43) + +- Exchange (42) + +- HashAggregate (41) + +- Exchange (40) + +- HashAggregate (39) + +- Project (38) + +- BroadcastHashJoin Inner BuildLeft (37) + :- BroadcastExchange (33) + : +- Filter (32) + : +- Scan parquet (31) + +- Project (36) + +- Filter (35) + +- Scan parquet (34) (1) Scan parquet @@ -110,122 +109,130 @@ Input [3]: [l_shipmode#X, sum#X, sum#X] Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: false -(16) ColumnarExchange +(16) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] +Arguments: X + +(17) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] -(17) ShuffleQueryStage +(18) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] Arguments: X -(18) InputAdapter +(19) InputAdapter Input [3]: [l_shipmode#X, sum#X, sum#X] -(19) InputIteratorTransformer +(20) InputIteratorTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] -(20) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(22) ColumnarExchange +(23) VeloxAppendBatches +Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] +Arguments: X + +(24) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(23) 
ShuffleQueryStage +(25) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: X -(24) InputAdapter +(26) InputAdapter Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(25) InputIteratorTransformer +(27) InputIteratorTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(26) SortExecTransformer +(28) SortExecTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(27) WholeStageCodegenTransformer (X) +(29) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(28) VeloxColumnarToRowExec +(30) VeloxColumnarToRowExec Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(29) Scan parquet +(31) Scan parquet Output [2]: [o_orderkey#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(30) Filter +(32) Filter Input [2]: [o_orderkey#X, o_orderpriority#X] Condition : isnotnull(o_orderkey#X) -(31) BroadcastExchange +(33) BroadcastExchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(32) Scan parquet +(34) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(33) Filter +(35) Filter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Condition : ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(34) Project +(36) Project Output [2]: [l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(35) BroadcastHashJoin +(37) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(36) Project +(38) Project Output [2]: [o_orderpriority#X, l_shipmode#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(37) HashAggregate +(39) HashAggregate Input [2]: [o_orderpriority#X, l_shipmode#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), partial_sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(38) Exchange +(40) Exchange Input [3]: [l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) HashAggregate +(41) HashAggregate Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR 
(o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(40) Exchange +(42) Exchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Sort +(43) Sort Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(42) AdaptiveSparkPlan +(44) AdaptiveSparkPlan Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/13.txt index 9841ce1395e15..e44b10042401e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/13.txt @@ -1,53 +1,52 @@ == Physical Plan == -AdaptiveSparkPlan (52) +AdaptiveSparkPlan (55) +- == Final Plan == - VeloxColumnarToRowExec (36) - +- ^ SortExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31), Statistics(X) - +- ColumnarExchange (30) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (39) + +- ^ SortExecTransformer (37) + +- ^ InputIteratorTransformer (36) + +- ShuffleQueryStage (34), Statistics(X) + +- ColumnarExchange (33) + +- VeloxAppendBatches (32) + +- ^ RegularHashAggregateExecTransformer (30) + +- ^ InputIteratorTransformer (29) + +- ShuffleQueryStage (27), Statistics(X) + +- ColumnarExchange (26) + +- VeloxAppendBatches (25) + +- ^ ProjectExecTransformer (23) + +- ^ FlushableHashAggregateExecTransformer (22) + +- ^ ProjectExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ ProjectExecTransformer (13) +- ^ FlushableHashAggregateExecTransformer (12) +- ^ ProjectExecTransformer (11) - +- ^ BroadcastHashJoinExecTransformer LeftOuter (10) + +- ^ BroadcastHashJoinExecTransformer LeftOuter BuildRight (10) :- ^ Scan parquet (1) +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ BroadcastQueryStage (7), Statistics(X) - +- ColumnarBroadcastExchange (6) - +- ^ ProjectExecTransformer (4) - +- ^ NoopFilter (3) - +- ^ Scan parquet (2) + +- BroadcastQueryStage (7), Statistics(X) + +- ColumnarBroadcastExchange (6) + +- ^ ProjectExecTransformer (4) + +- ^ NoopFilter (3) + +- ^ Scan parquet (2) +- == Initial Plan == - Sort (51) - +- Exchange (50) - +- HashAggregate (49) - +- Exchange (48) - +- HashAggregate (47) - +- 
HashAggregate (46) - +- Exchange (45) - +- HashAggregate (44) - +- Project (43) - +- BroadcastHashJoin LeftOuter BuildRight (42) - :- Scan parquet (37) - +- BroadcastExchange (41) - +- Project (40) - +- Filter (39) - +- Scan parquet (38) + Sort (54) + +- Exchange (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- HashAggregate (49) + +- Exchange (48) + +- HashAggregate (47) + +- Project (46) + +- BroadcastHashJoin LeftOuter BuildRight (45) + :- Scan parquet (40) + +- BroadcastExchange (44) + +- Project (43) + +- Filter (42) + +- Scan parquet (41) (1) Scan parquet @@ -114,175 +113,187 @@ Input [2]: [c_custkey#X, count#X] Input [3]: [hash_partition_key#X, c_custkey#X, count#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, count#X] +Arguments: X + +(16) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, count#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [2]: [c_custkey#X, count#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [2]: [c_custkey#X, count#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [2]: [c_custkey#X, count#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [2]: [c_custkey#X, count(o_orderkey#X)#X] -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [count(o_orderkey#X)#X AS c_count#X] Input [2]: [c_custkey#X, count(o_orderkey#X)#X] -(21) FlushableHashAggregateExecTransformer +(22) FlushableHashAggregateExecTransformer Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(22) ProjectExecTransformer +(23) ProjectExecTransformer Output [3]: [hash(c_count#X, 42) AS hash_partition_key#X, c_count#X, count#X] Input [2]: [c_count#X, count#X] -(23) WholeStageCodegenTransformer (X) +(24) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: false -(24) ColumnarExchange +(25) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_count#X, count#X] +Arguments: X + +(26) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(27) ShuffleQueryStage Output [2]: [c_count#X, count#X] Arguments: X -(26) InputAdapter +(28) InputAdapter Input [2]: [c_count#X, count#X] -(27) InputIteratorTransformer +(29) InputIteratorTransformer Input [2]: [c_count#X, count#X] -(28) RegularHashAggregateExecTransformer +(30) RegularHashAggregateExecTransformer Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(29) WholeStageCodegenTransformer (X) +(31) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(30) ColumnarExchange +(32) VeloxAppendBatches +Input [2]: [c_count#X, custdist#X] +Arguments: X + +(33) ColumnarExchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(34) 
ShuffleQueryStage Output [2]: [c_count#X, custdist#X] Arguments: X -(32) InputAdapter +(35) InputAdapter Input [2]: [c_count#X, custdist#X] -(33) InputIteratorTransformer +(36) InputIteratorTransformer Input [2]: [c_count#X, custdist#X] -(34) SortExecTransformer +(37) SortExecTransformer Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(35) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(36) VeloxColumnarToRowExec +(39) VeloxColumnarToRowExec Input [2]: [c_count#X, custdist#X] -(37) Scan parquet +(40) Scan parquet Output [1]: [c_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Scan parquet +(41) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(39) Filter +(42) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Condition : ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(40) Project +(43) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(41) BroadcastExchange +(44) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(42) BroadcastHashJoin +(45) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: LeftOuter Join condition: None -(43) Project +(46) Project Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(44) HashAggregate +(47) HashAggregate Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(45) Exchange +(48) Exchange Input [2]: [c_custkey#X, count#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(46) HashAggregate +(49) HashAggregate Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [1]: [count(o_orderkey#X)#X AS c_count#X] -(47) HashAggregate +(50) HashAggregate Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(48) Exchange +(51) Exchange Input [2]: [c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) HashAggregate +(52) HashAggregate Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(50) Exchange +(53) Exchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Sort +(54) Sort Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(52) AdaptiveSparkPlan +(55) AdaptiveSparkPlan Output [2]: [c_count#X, custdist#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/14.txt index fe13deb405004..8f5ae0f52e634 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/14.txt @@ -1,37 +1,36 @@ == Physical Plan == -AdaptiveSparkPlan (34) +AdaptiveSparkPlan (35) +- == Final Plan == - VeloxColumnarToRowExec (22) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (23) + +- ^ ProjectExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) :- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ BroadcastQueryStage (8), Statistics(X) - +- ColumnarBroadcastExchange (7) - +- ^ NoopFilter (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (8), Statistics(X) + +- ColumnarBroadcastExchange (7) + +- ^ NoopFilter (5) + +- ^ Scan parquet (4) +- == Initial Plan == - HashAggregate (33) - +- Exchange (32) - +- HashAggregate (31) - +- Project (30) - +- BroadcastHashJoin Inner BuildRight (29) - :- Project (25) - : +- Filter (24) - : +- Scan parquet (23) - +- BroadcastExchange (28) - +- Filter (27) - +- Scan parquet (26) + HashAggregate (34) + +- Exchange (33) + +- HashAggregate (32) + +- Project (31) + +- BroadcastHashJoin Inner BuildRight (30) + :- Project (26) + : +- Filter (25) + : +- Scan parquet (24) + +- BroadcastExchange (29) + +- Filter (28) + +- Scan parquet (27) (1) Scan parquet @@ -99,96 +98,100 @@ Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(16) ColumnarExchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END), sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [((100.00 * sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X) / sum((l_extendedprice#X * (1 - l_discount#X)))#X) AS promo_revenue#X] Input [2]: [sum(CASE WHEN 
StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [1]: [promo_revenue#X] Arguments: false -(22) VeloxColumnarToRowExec +(23) VeloxColumnarToRowExec Input [1]: [promo_revenue#X] -(23) Scan parquet +(24) Scan parquet Output [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-09-01), LessThan(l_shipdate,1995-10-01), IsNotNull(l_partkey)] ReadSchema: struct -(24) Filter +(25) Filter Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-09-01)) AND (l_shipdate#X < 1995-10-01)) AND isnotnull(l_partkey#X)) -(25) Project +(26) Project Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(26) Scan parquet +(27) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(27) Filter +(28) Filter Input [2]: [p_partkey#X, p_type#X] Condition : isnotnull(p_partkey#X) -(28) BroadcastExchange +(29) BroadcastExchange Input [2]: [p_partkey#X, p_type#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(29) BroadcastHashJoin +(30) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(30) Project +(31) Project Output [3]: [l_extendedprice#X, l_discount#X, p_type#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(31) HashAggregate +(32) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, p_type#X] Keys: [] Functions [2]: [partial_sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END), partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(32) Exchange +(33) Exchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(33) HashAggregate +(34) HashAggregate Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END), sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [((100.00 * sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X) / sum((l_extendedprice#X * (1 - l_discount#X)))#X) AS promo_revenue#X] -(34) AdaptiveSparkPlan +(35) AdaptiveSparkPlan Output [1]: [promo_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/15.txt index 4b761198f3e36..d616981e57d21 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/15.txt @@ -1,44 +1,44 @@ == 
Physical Plan == -AdaptiveSparkPlan (41) +AdaptiveSparkPlan (43) +- == Final Plan == - VeloxColumnarToRowExec (26) - +- AQEShuffleRead (25) - +- ShuffleQueryStage (24), Statistics(X) - +- ColumnarExchange (23) - +- ^ ProjectExecTransformer (21) - +- ^ BroadcastHashJoinExecTransformer Inner (20) - :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ BroadcastQueryStage (5), Statistics(X) - : +- ColumnarBroadcastExchange (4) - : +- ^ NoopFilter (2) - : +- ^ Scan parquet (1) - +- ^ FilterExecTransformer (19) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ InputIteratorTransformer (17) - +- ^ InputAdapter (16) - +- ^ ShuffleQueryStage (15), Statistics(X) - +- ColumnarExchange (14) - +- ^ ProjectExecTransformer (12) - +- ^ FlushableHashAggregateExecTransformer (11) - +- ^ ProjectExecTransformer (10) - +- ^ NoopFilter (9) - +- ^ Scan parquet (8) + VeloxColumnarToRowExec (28) + +- AQEShuffleRead (27) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (21) + :- ^ InputIteratorTransformer (7) + : +- BroadcastQueryStage (5), Statistics(X) + : +- ColumnarBroadcastExchange (4) + : +- ^ NoopFilter (2) + : +- ^ Scan parquet (1) + +- ^ FilterExecTransformer (20) + +- ^ RegularHashAggregateExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FlushableHashAggregateExecTransformer (11) + +- ^ ProjectExecTransformer (10) + +- ^ NoopFilter (9) + +- ^ Scan parquet (8) +- == Initial Plan == - Sort (40) - +- Exchange (39) - +- Project (38) - +- BroadcastHashJoin Inner BuildLeft (37) - :- BroadcastExchange (29) - : +- Filter (28) - : +- Scan parquet (27) - +- Filter (36) - +- HashAggregate (35) - +- Exchange (34) - +- HashAggregate (33) - +- Project (32) - +- Filter (31) - +- Scan parquet (30) + Sort (42) + +- Exchange (41) + +- Project (40) + +- BroadcastHashJoin Inner BuildLeft (39) + :- BroadcastExchange (31) + : +- Filter (30) + : +- Scan parquet (29) + +- Filter (38) + +- HashAggregate (37) + +- Exchange (36) + +- HashAggregate (35) + +- Project (34) + +- Filter (33) + +- Scan parquet (32) (1) Scan parquet @@ -100,287 +100,299 @@ Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] -(19) FilterExecTransformer +(20) 
FilterExecTransformer Input [2]: [supplier_no#X, total_revenue#X] Arguments: (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(20) BroadcastHashJoinExecTransformer +(21) BroadcastHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join type: Inner Join condition: None -(21) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: X -(25) AQEShuffleRead +(27) AQEShuffleRead Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: local -(26) VeloxColumnarToRowExec +(28) VeloxColumnarToRowExec Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(27) Scan parquet +(29) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey)] ReadSchema: struct -(28) Filter +(30) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Condition : isnotnull(s_suppkey#X) -(29) BroadcastExchange +(31) BroadcastExchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(30) Scan parquet +(32) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(31) Filter +(33) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(32) Project +(34) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(33) HashAggregate +(35) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(34) Exchange +(36) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(35) HashAggregate +(37) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] -(36) Filter +(38) Filter Input [2]: [supplier_no#X, 
total_revenue#X] Condition : (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(37) BroadcastHashJoin +(39) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join type: Inner Join condition: None -(38) Project +(40) Project Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(39) Exchange +(41) Exchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(40) Sort +(42) Sort Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(41) AdaptiveSparkPlan +(43) AdaptiveSparkPlan Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 19 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (66) +Subquery:1 Hosting operator id = 20 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (69) +- == Final Plan == - VeloxColumnarToRowExec (57) - +- ^ RegularHashAggregateExecTransformer (55) - +- ^ RegularHashAggregateExecTransformer (54) - +- ^ ProjectExecTransformer (53) - +- ^ RegularHashAggregateExecTransformer (52) - +- ^ InputIteratorTransformer (51) - +- ^ InputAdapter (50) - +- ^ ShuffleQueryStage (49), Statistics(X) - +- ColumnarExchange (48) - +- ^ ProjectExecTransformer (46) - +- ^ FlushableHashAggregateExecTransformer (45) - +- ^ ProjectExecTransformer (44) - +- ^ NoopFilter (43) - +- ^ Scan parquet (42) + VeloxColumnarToRowExec (60) + +- ^ RegularHashAggregateExecTransformer (58) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ ProjectExecTransformer (56) + +- ^ RegularHashAggregateExecTransformer (55) + +- ^ InputIteratorTransformer (54) + +- ShuffleQueryStage (52), Statistics(X) + +- ColumnarExchange (51) + +- VeloxAppendBatches (50) + +- ^ ProjectExecTransformer (48) + +- ^ FlushableHashAggregateExecTransformer (47) + +- ^ ProjectExecTransformer (46) + +- ^ NoopFilter (45) + +- ^ Scan parquet (44) +- == Initial Plan == - HashAggregate (65) - +- HashAggregate (64) - +- HashAggregate (63) - +- Exchange (62) - +- HashAggregate (61) - +- Project (60) - +- Filter (59) - +- Scan parquet (58) + HashAggregate (68) + +- HashAggregate (67) + +- HashAggregate (66) + +- Exchange (65) + +- HashAggregate (64) + +- Project (63) + +- Filter (62) + +- Scan parquet (61) -(42) Scan parquet +(44) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(43) NoopFilter +(45) NoopFilter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(44) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(45) FlushableHashAggregateExecTransformer +(47) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate 
Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(46) ProjectExecTransformer +(48) ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(47) WholeStageCodegenTransformer (X) +(49) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(48) ColumnarExchange +(50) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(51) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(49) ShuffleQueryStage +(52) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(50) InputAdapter +(53) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(51) InputIteratorTransformer +(54) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(52) RegularHashAggregateExecTransformer +(55) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [l_suppkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(53) ProjectExecTransformer +(56) ProjectExecTransformer Output [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] Input [2]: [l_suppkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(54) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(55) RegularHashAggregateExecTransformer +(58) RegularHashAggregateExecTransformer Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(56) WholeStageCodegenTransformer (X) +(59) WholeStageCodegenTransformer (X) Input [1]: [max(total_revenue)#X] Arguments: false -(57) VeloxColumnarToRowExec +(60) VeloxColumnarToRowExec Input [1]: [max(total_revenue)#X] -(58) Scan parquet +(61) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(59) Filter +(62) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) -(60) Project +(63) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(61) HashAggregate +(64) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(62) Exchange +(65) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(63) HashAggregate +(66) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions 
[1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] -(64) HashAggregate +(67) HashAggregate Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(65) HashAggregate +(68) HashAggregate Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(66) AdaptiveSparkPlan +(69) AdaptiveSparkPlan Output [1]: [max(total_revenue)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/16.txt index f178803b43e9c..3b4da47fc25f4 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/16.txt @@ -1,57 +1,56 @@ == Physical Plan == -AdaptiveSparkPlan (56) +AdaptiveSparkPlan (59) +- == Final Plan == - VeloxColumnarToRowExec (35) - +- ^ SortExecTransformer (33) - +- ^ InputIteratorTransformer (32) - +- ^ InputAdapter (31) - +- ^ ShuffleQueryStage (30), Statistics(X) - +- ColumnarExchange (29) - +- ^ RegularHashAggregateExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24), Statistics(X) - +- ColumnarExchange (23) - +- ^ ProjectExecTransformer (21) - +- ^ FlushableHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ ProjectExecTransformer (13) +- ^ FlushableHashAggregateExecTransformer (12) +- ^ ProjectExecTransformer (11) - +- ^ BroadcastHashJoinExecTransformer Inner (10) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (10) :- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ BroadcastQueryStage (7), Statistics(X) - +- ColumnarBroadcastExchange (6) - +- ^ NoopFilter (4) - +- ^ Scan parquet (3) + +- BroadcastQueryStage (7), Statistics(X) + +- ColumnarBroadcastExchange (6) + +- ^ NoopFilter (4) + +- ^ Scan parquet (3) +- == Initial Plan == - Sort (55) - +- Exchange (54) - +- HashAggregate (53) - +- Exchange (52) - +- HashAggregate (51) - +- HashAggregate (50) - +- Exchange (49) - +- HashAggregate (48) - +- Project (47) - +- BroadcastHashJoin Inner BuildRight (46) - :- BroadcastHashJoin LeftAnti BuildRight (42) - : :- Filter (37) - : : +- Scan parquet (36) - : +- BroadcastExchange (41) - : +- Project (40) - : +- Filter (39) - : +- Scan parquet (38) 
- +- BroadcastExchange (45) - +- Filter (44) - +- Scan parquet (43) + Sort (58) + +- Exchange (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- HashAggregate (53) + +- Exchange (52) + +- HashAggregate (51) + +- Project (50) + +- BroadcastHashJoin Inner BuildRight (49) + :- BroadcastHashJoin LeftAnti BuildRight (45) + : :- Filter (40) + : : +- Scan parquet (39) + : +- BroadcastExchange (44) + : +- Project (43) + : +- Filter (42) + : +- Scan parquet (41) + +- BroadcastExchange (48) + +- Filter (47) + +- Scan parquet (46) (1) Scan parquet @@ -119,197 +118,209 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] +Arguments: X + +(16) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(20) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(21) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: X -(25) InputAdapter +(27) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(27) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, 
count(ps_suppkey#X)#X AS supplier_cnt#X] -(28) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(29) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(30) ShuffleQueryStage +(33) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: X -(31) InputAdapter +(34) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(32) InputIteratorTransformer +(35) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(33) SortExecTransformer +(36) SortExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(34) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(35) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(36) Scan parquet +(39) Scan parquet Output [2]: [ps_partkey#X, ps_suppkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_partkey)] ReadSchema: struct -(37) Filter +(40) Filter Input [2]: [ps_partkey#X, ps_suppkey#X] Condition : isnotnull(ps_partkey#X) -(38) Scan parquet +(41) Scan parquet Output [2]: [s_suppkey#X, s_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_comment)] ReadSchema: struct -(39) Filter +(42) Filter Input [2]: [s_suppkey#X, s_comment#X] Condition : (isnotnull(s_comment#X) AND s_comment#X LIKE %Customer%Complaints%) -(40) Project +(43) Project Output [1]: [s_suppkey#X] Input [2]: [s_suppkey#X, s_comment#X] -(41) BroadcastExchange +(44) BroadcastExchange Input [1]: [s_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),true), [plan_id=X] -(42) BroadcastHashJoin +(45) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: LeftAnti Join condition: None -(43) Scan parquet +(46) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(44) Filter +(47) Filter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Condition : (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(45) BroadcastExchange +(48) BroadcastExchange Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(46) BroadcastHashJoin +(49) BroadcastHashJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(47) Project +(50) Project Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, 
ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(48) HashAggregate +(51) HashAggregate Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(49) Exchange +(52) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(50) HashAggregate +(53) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(51) HashAggregate +(54) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(52) Exchange +(55) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(53) HashAggregate +(56) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(54) Exchange +(57) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(55) Sort +(58) Sort Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(56) AdaptiveSparkPlan +(59) AdaptiveSparkPlan Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/17.txt index 4a2b5a1744e07..77733ad6f8a5b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/17.txt @@ -1,39 +1,38 @@ == Physical Plan == -AdaptiveSparkPlan (35) +AdaptiveSparkPlan (36) +- == Final Plan == - VeloxColumnarToRowExec (14) - +- ^ ProjectExecTransformer (12) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8), Statistics(X) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (15) + +- ^ ProjectExecTransformer (13) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9), Statistics(X) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ FlushableHashAggregateExecTransformer (5) +- ^ InputIteratorTransformer (4) - +- ^ InputAdapter (3) - +- ^ RowToVeloxColumnar (2) - +- ^ LocalTableScan (1) + +- RowToVeloxColumnar (2) + +- LocalTableScan (1) +- == Initial Plan == - HashAggregate (34) - +- Exchange (33) - +- HashAggregate (32) - +- Project (31) - +- BroadcastHashJoin Inner BuildRight (30) - :- Project (22) - 
: +- BroadcastHashJoin Inner BuildRight (21) - : :- Filter (16) - : : +- Scan parquet (15) - : +- BroadcastExchange (20) - : +- Project (19) - : +- Filter (18) - : +- Scan parquet (17) - +- BroadcastExchange (29) - +- Filter (28) - +- HashAggregate (27) - +- Exchange (26) - +- HashAggregate (25) - +- Filter (24) - +- Scan parquet (23) + HashAggregate (35) + +- Exchange (34) + +- HashAggregate (33) + +- Project (32) + +- BroadcastHashJoin Inner BuildRight (31) + :- Project (23) + : +- BroadcastHashJoin Inner BuildRight (22) + : :- Filter (17) + : : +- Scan parquet (16) + : +- BroadcastExchange (21) + : +- Project (20) + : +- Filter (19) + : +- Scan parquet (18) + +- BroadcastExchange (30) + +- Filter (29) + +- HashAggregate (28) + +- Exchange (27) + +- HashAggregate (26) + +- Filter (25) + +- Scan parquet (24) (1) LocalTableScan @@ -60,143 +59,147 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(8) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [2]: [sum#X, isEmpty#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [sum(l_extendedprice#X)#X] -(12) ProjectExecTransformer +(13) ProjectExecTransformer Output [1]: [(sum(l_extendedprice#X)#X / 7.0) AS avg_yearly#X] Input [1]: [sum(l_extendedprice#X)#X] -(13) WholeStageCodegenTransformer (X) +(14) WholeStageCodegenTransformer (X) Input [1]: [avg_yearly#X] Arguments: false -(14) VeloxColumnarToRowExec +(15) VeloxColumnarToRowExec Input [1]: [avg_yearly#X] -(15) Scan parquet +(16) Scan parquet Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_quantity)] ReadSchema: struct -(16) Filter +(17) Filter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Condition : (isnotnull(l_partkey#X) AND isnotnull(l_quantity#X)) -(17) Scan parquet +(18) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(18) Filter +(19) Filter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Condition : ((((isnotnull(p_brand#X) AND isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(19) Project +(20) Project Output [1]: [p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(20) BroadcastExchange +(21) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(21) BroadcastHashJoin +(22) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(22) Project +(23) Project Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(23) Scan parquet +(24) Scan parquet Output [2]: [l_partkey#X, 
l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(24) Filter +(25) Filter Input [2]: [l_partkey#X, l_quantity#X] Condition : isnotnull(l_partkey#X) -(25) HashAggregate +(26) HashAggregate Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(26) Exchange +(27) Exchange Input [3]: [l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) HashAggregate +(28) HashAggregate Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [(0.2 * avg(l_quantity#X)#X) AS (0.2 * avg(l_quantity))#X, l_partkey#X] -(28) Filter +(29) Filter Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Condition : isnotnull((0.2 * avg(l_quantity))#X) -(29) BroadcastExchange +(30) BroadcastExchange Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(30) BroadcastHashJoin +(31) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(31) Project +(32) Project Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(32) HashAggregate +(33) HashAggregate Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(33) Exchange +(34) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(34) HashAggregate +(35) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [(sum(l_extendedprice#X)#X / 7.0) AS avg_yearly#X] -(35) AdaptiveSparkPlan +(36) AdaptiveSparkPlan Output [1]: [avg_yearly#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/18.txt index 2484955032d0d..0f5e82ec54c16 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/18.txt @@ -1,86 +1,82 @@ == Physical Plan == -AdaptiveSparkPlan (86) +AdaptiveSparkPlan (88) +- == Final Plan == - VeloxColumnarToRowExec (53) - +- TakeOrderedAndProjectExecTransformer (52) - +- ^ RegularHashAggregateExecTransformer (50) - +- ^ InputIteratorTransformer (49) - +- ^ InputAdapter (48) - +- ^ ShuffleQueryStage (47), Statistics(X) - +- ColumnarExchange (46) - +- ^ ProjectExecTransformer (44) - +- ^ FlushableHashAggregateExecTransformer (43) - +- ^ ProjectExecTransformer (42) - +- ^ BroadcastHashJoinExecTransformer Inner (41) - :- ^ ProjectExecTransformer (28) - : +- ^ BroadcastHashJoinExecTransformer Inner (27) + VeloxColumnarToRowExec (55) + +- TakeOrderedAndProjectExecTransformer (54) + +- ^ RegularHashAggregateExecTransformer (52) + +- ^ InputIteratorTransformer (51) + +- ShuffleQueryStage (49), Statistics(X) + +- ColumnarExchange (48) + +- VeloxAppendBatches (47) + +- ^ ProjectExecTransformer (45) + +- ^ 
FlushableHashAggregateExecTransformer (44) + +- ^ ProjectExecTransformer (43) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (42) + :- ^ ProjectExecTransformer (29) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (28) : :- ^ InputIteratorTransformer (7) - : : +- ^ InputAdapter (6) - : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : +- ColumnarBroadcastExchange (4) - : : +- ^ NoopFilter (2) - : : +- ^ Scan parquet (1) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (26) + : : +- BroadcastQueryStage (5), Statistics(X) + : : +- ColumnarBroadcastExchange (4) + : : +- ^ NoopFilter (2) + : : +- ^ Scan parquet (1) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (27) : :- ^ NoopFilter (9) : : +- ^ Scan parquet (8) - : +- ^ InputIteratorTransformer (25) - : +- ^ InputAdapter (24) - : +- ^ BroadcastQueryStage (23), Statistics(X) - : +- ColumnarBroadcastExchange (22) - : +- ^ ProjectExecTransformer (20) - : +- ^ FilterExecTransformer (19) - : +- ^ RegularHashAggregateExecTransformer (18) - : +- ^ InputIteratorTransformer (17) - : +- ^ InputAdapter (16) - : +- ^ ShuffleQueryStage (15), Statistics(X) - : +- ColumnarExchange (14) - : +- ^ ProjectExecTransformer (12) - : +- ^ FlushableHashAggregateExecTransformer (11) - : +- ^ Scan parquet (10) - +- ^ InputIteratorTransformer (40) - +- ^ InputAdapter (39) - +- ^ BroadcastQueryStage (38), Statistics(X) - +- ColumnarBroadcastExchange (37) - +- ^ BroadcastHashJoinExecTransformer LeftSemi (35) - :- ^ NoopFilter (30) - : +- ^ Scan parquet (29) - +- ^ InputIteratorTransformer (34) - +- ^ InputAdapter (33) - +- ^ BroadcastQueryStage (32), Statistics(X) - +- ReusedExchange (31) + : +- ^ InputIteratorTransformer (26) + : +- BroadcastQueryStage (24), Statistics(X) + : +- ColumnarBroadcastExchange (23) + : +- ^ ProjectExecTransformer (21) + : +- ^ FilterExecTransformer (20) + : +- ^ RegularHashAggregateExecTransformer (19) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FlushableHashAggregateExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (41) + +- BroadcastQueryStage (39), Statistics(X) + +- ColumnarBroadcastExchange (38) + +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (36) + :- ^ NoopFilter (31) + : +- ^ Scan parquet (30) + +- ^ InputIteratorTransformer (35) + +- BroadcastQueryStage (33), Statistics(X) + +- ReusedExchange (32) +- == Initial Plan == - TakeOrderedAndProject (85) - +- HashAggregate (84) - +- Exchange (83) - +- HashAggregate (82) - +- Project (81) - +- BroadcastHashJoin Inner BuildRight (80) - :- Project (68) - : +- BroadcastHashJoin Inner BuildLeft (67) - : :- BroadcastExchange (56) - : : +- Filter (55) - : : +- Scan parquet (54) - : +- BroadcastHashJoin LeftSemi BuildRight (66) - : :- Filter (58) - : : +- Scan parquet (57) - : +- BroadcastExchange (65) - : +- Project (64) - : +- Filter (63) - : +- HashAggregate (62) - : +- Exchange (61) - : +- HashAggregate (60) - : +- Scan parquet (59) - +- BroadcastExchange (79) - +- BroadcastHashJoin LeftSemi BuildRight (78) - :- Filter (70) - : +- Scan parquet (69) - +- BroadcastExchange (77) - +- Project (76) - +- Filter (75) - +- HashAggregate (74) - +- Exchange (73) - +- HashAggregate (72) - +- Scan parquet (71) + TakeOrderedAndProject (87) + +- HashAggregate (86) + +- Exchange (85) + +- HashAggregate (84) + +- Project (83) + +- BroadcastHashJoin Inner BuildRight (82) + :- Project (70) 
+ : +- BroadcastHashJoin Inner BuildLeft (69) + : :- BroadcastExchange (58) + : : +- Filter (57) + : : +- Scan parquet (56) + : +- BroadcastHashJoin LeftSemi BuildRight (68) + : :- Filter (60) + : : +- Scan parquet (59) + : +- BroadcastExchange (67) + : +- Project (66) + : +- Filter (65) + : +- HashAggregate (64) + : +- Exchange (63) + : +- HashAggregate (62) + : +- Scan parquet (61) + +- BroadcastExchange (81) + +- BroadcastHashJoin LeftSemi BuildRight (80) + :- Filter (72) + : +- Scan parquet (71) + +- BroadcastExchange (79) + +- Project (78) + +- Filter (77) + +- HashAggregate (76) + +- Exchange (75) + +- HashAggregate (74) + +- Scan parquet (73) (1) Scan parquet @@ -144,341 +140,349 @@ Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(19) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [1]: [l_orderkey#X] Arguments: false -(22) ColumnarBroadcastExchange +(23) ColumnarBroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(23) BroadcastQueryStage +(24) BroadcastQueryStage Output [1]: [l_orderkey#X] Arguments: X -(24) InputAdapter +(25) InputAdapter Input [1]: [l_orderkey#X] -(25) InputIteratorTransformer +(26) InputIteratorTransformer Input [1]: [l_orderkey#X] -(26) BroadcastHashJoinExecTransformer +(27) BroadcastHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(27) BroadcastHashJoinExecTransformer +(28) BroadcastHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(28) ProjectExecTransformer +(29) ProjectExecTransformer Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(29) Scan parquet +(30) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(30) NoopFilter +(31) NoopFilter Input [2]: [l_orderkey#X, l_quantity#X] Arguments: [l_orderkey#X, l_quantity#X] -(31) ReusedExchange [Reuses operator id: 
22] +(32) ReusedExchange [Reuses operator id: 23] Output [1]: [l_orderkey#X] -(32) BroadcastQueryStage +(33) BroadcastQueryStage Output [1]: [l_orderkey#X] Arguments: X -(33) InputAdapter +(34) InputAdapter Input [1]: [l_orderkey#X] -(34) InputIteratorTransformer +(35) InputIteratorTransformer Input [1]: [l_orderkey#X] -(35) BroadcastHashJoinExecTransformer +(36) BroadcastHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(36) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [2]: [l_orderkey#X, l_quantity#X] Arguments: false -(37) ColumnarBroadcastExchange +(38) ColumnarBroadcastExchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(38) BroadcastQueryStage +(39) BroadcastQueryStage Output [2]: [l_orderkey#X, l_quantity#X] Arguments: X -(39) InputAdapter +(40) InputAdapter Input [2]: [l_orderkey#X, l_quantity#X] -(40) InputIteratorTransformer +(41) InputIteratorTransformer Input [2]: [l_orderkey#X, l_quantity#X] -(41) BroadcastHashJoinExecTransformer +(42) BroadcastHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(42) ProjectExecTransformer +(43) ProjectExecTransformer Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(43) FlushableHashAggregateExecTransformer +(44) FlushableHashAggregateExecTransformer Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(44) ProjectExecTransformer +(45) ProjectExecTransformer Output [8]: [hash(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 42) AS hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(45) WholeStageCodegenTransformer (X) +(46) WholeStageCodegenTransformer (X) Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: false -(46) ColumnarExchange +(47) VeloxAppendBatches +Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] +Arguments: X + +(48) ColumnarExchange Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(47) ShuffleQueryStage +(49) ShuffleQueryStage Output [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: X -(48) InputAdapter +(50) InputAdapter Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(49) InputIteratorTransformer +(51) InputIteratorTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, 
o_totalprice#X, sum#X, isEmpty#X] -(50) RegularHashAggregateExecTransformer +(52) RegularHashAggregateExecTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(51) WholeStageCodegenTransformer (X) +(53) WholeStageCodegenTransformer (X) Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: false -(52) TakeOrderedAndProjectExecTransformer +(54) TakeOrderedAndProjectExecTransformer Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X], 0 -(53) VeloxColumnarToRowExec +(55) VeloxColumnarToRowExec Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(54) Scan parquet +(56) Scan parquet Output [2]: [c_custkey#X, c_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey)] ReadSchema: struct -(55) Filter +(57) Filter Input [2]: [c_custkey#X, c_name#X] Condition : isnotnull(c_custkey#X) -(56) BroadcastExchange +(58) BroadcastExchange Input [2]: [c_custkey#X, c_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(57) Scan parquet +(59) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(58) Filter +(60) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Condition : (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(59) Scan parquet +(61) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(60) HashAggregate +(62) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(61) Exchange +(63) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(62) HashAggregate +(64) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(63) Filter +(65) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(64) Project +(66) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(65) BroadcastExchange +(67) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(66) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(67) BroadcastHashJoin +(69) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join 
condition: None -(68) Project +(70) Project Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(69) Scan parquet +(71) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [2]: [l_orderkey#X, l_quantity#X] Condition : isnotnull(l_orderkey#X) -(71) Scan parquet +(73) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(72) HashAggregate +(74) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(73) Exchange +(75) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) HashAggregate +(76) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(75) Filter +(77) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(76) Project +(78) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(77) BroadcastExchange +(79) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(78) BroadcastHashJoin +(80) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(79) BroadcastExchange +(81) BroadcastExchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(80) BroadcastHashJoin +(82) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(81) Project +(83) Project Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(82) HashAggregate +(84) HashAggregate Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(83) Exchange +(85) Exchange Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(84) HashAggregate +(86) HashAggregate Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(85) TakeOrderedAndProject +(87) 
TakeOrderedAndProject Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(86) AdaptiveSparkPlan +(88) AdaptiveSparkPlan Output [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/19.txt index 1a2a13c21db84..3bafdb994153b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/19.txt @@ -1,36 +1,35 @@ == Physical Plan == -AdaptiveSparkPlan (33) +AdaptiveSparkPlan (34) +- == Final Plan == - VeloxColumnarToRowExec (21) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (22) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) :- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ BroadcastQueryStage (8), Statistics(X) - +- ColumnarBroadcastExchange (7) - +- ^ NoopFilter (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (8), Statistics(X) + +- ColumnarBroadcastExchange (7) + +- ^ NoopFilter (5) + +- ^ Scan parquet (4) +- == Initial Plan == - HashAggregate (32) - +- Exchange (31) - +- HashAggregate (30) - +- Project (29) - +- BroadcastHashJoin Inner BuildRight (28) - :- Project (24) - : +- Filter (23) - : +- Scan parquet (22) - +- BroadcastExchange (27) - +- Filter (26) - +- Scan parquet (25) + HashAggregate (33) + +- Exchange (32) + +- HashAggregate (31) + +- Project (30) + +- BroadcastHashJoin Inner BuildRight (29) + :- Project (25) + : +- Filter (24) + : +- Scan parquet (23) + +- BroadcastExchange (28) + +- Filter (27) + +- Scan parquet (26) (1) Scan parquet @@ -98,92 +97,96 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(16) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [2]: [sum#X, isEmpty#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(20) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] 
Arguments: false -(21) VeloxColumnarToRowExec +(22) VeloxColumnarToRowExec Input [1]: [revenue#X] -(22) Scan parquet +(23) Scan parquet Output [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipinstruct), In(l_shipmode, [AIR,AIR REG]), EqualTo(l_shipinstruct,DELIVER IN PERSON), IsNotNull(l_partkey), Or(Or(And(GreaterThanOrEqual(l_quantity,1.00),LessThanOrEqual(l_quantity,11.00)),And(GreaterThanOrEqual(l_quantity,10.00),LessThanOrEqual(l_quantity,20.00))),And(GreaterThanOrEqual(l_quantity,20.00),LessThanOrEqual(l_quantity,30.00)))] ReadSchema: struct -(23) Filter +(24) Filter Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Condition : ((((isnotnull(l_shipinstruct#X) AND l_shipmode#X IN (AIR,AIR REG)) AND (l_shipinstruct#X = DELIVER IN PERSON)) AND isnotnull(l_partkey#X)) AND ((((l_quantity#X >= 1.00) AND (l_quantity#X <= 11.00)) OR ((l_quantity#X >= 10.00) AND (l_quantity#X <= 20.00))) OR ((l_quantity#X >= 20.00) AND (l_quantity#X <= 30.00)))) -(24) Project +(25) Project Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] -(25) Scan parquet +(26) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(26) Filter +(27) Filter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Condition : (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(27) BroadcastExchange +(28) BroadcastExchange Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(28) BroadcastHashJoin +(29) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(29) Project +(30) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(30) HashAggregate +(31) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions 
[1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(31) Exchange +(32) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(32) HashAggregate +(33) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(33) AdaptiveSparkPlan +(34) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/20.txt index ed40d3bbe23dd..945cc79de1ac2 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/20.txt @@ -1,96 +1,92 @@ == Physical Plan == -AdaptiveSparkPlan (96) +AdaptiveSparkPlan (98) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- AQEShuffleRead (59) - +- ShuffleQueryStage (58), Statistics(X) - +- ColumnarExchange (57) - +- ^ ProjectExecTransformer (55) - +- ^ BroadcastHashJoinExecTransformer Inner (54) - :- ^ ProjectExecTransformer (45) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (44) - : :- ^ NoopFilter (2) - : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (43) - : +- ^ InputAdapter (42) - : +- ^ BroadcastQueryStage (41), Statistics(X) - : +- ColumnarBroadcastExchange (40) - : +- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) - : :- ^ InputIteratorTransformer (18) - : : +- ^ InputAdapter (17) - : : +- ^ BroadcastQueryStage (16), Statistics(X) - : : +- ColumnarBroadcastExchange (15) - : : +- ^ BroadcastHashJoinExecTransformer LeftSemi (13) - : : :- ^ NoopFilter (4) - : : : +- ^ Scan parquet (3) - : : +- ^ InputIteratorTransformer (12) - : : +- ^ InputAdapter (11) - : : +- ^ BroadcastQueryStage (10), Statistics(X) - : : +- ColumnarBroadcastExchange (9) - : : +- ^ ProjectExecTransformer (7) - : : +- ^ NoopFilter (6) - : : +- ^ Scan parquet (5) - : +- ^ FilterExecTransformer (36) - : +- ^ ProjectExecTransformer (35) - : +- ^ RegularHashAggregateExecTransformer (34) - : +- ^ InputIteratorTransformer (33) - : +- ^ InputAdapter (32) - : +- ^ ShuffleQueryStage (31), Statistics(X) - : +- ColumnarExchange (30) - : +- ^ ProjectExecTransformer (28) - : +- ^ FlushableHashAggregateExecTransformer (27) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (26) - : :- ^ ProjectExecTransformer (21) - : : +- ^ NoopFilter (20) - : : +- ^ Scan parquet (19) - : +- ^ InputIteratorTransformer (25) - : +- ^ InputAdapter (24) - : +- ^ BroadcastQueryStage (23), Statistics(X) - : +- ReusedExchange (22) - +- ^ InputIteratorTransformer (53) - +- ^ InputAdapter (52) - +- ^ BroadcastQueryStage (51), Statistics(X) - +- ColumnarBroadcastExchange (50) - +- ^ ProjectExecTransformer (48) - +- ^ NoopFilter (47) - +- ^ Scan parquet (46) + VeloxColumnarToRowExec (62) + +- AQEShuffleRead (61) + +- ShuffleQueryStage (60), Statistics(X) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ ProjectExecTransformer (56) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (55) + :- ^ ProjectExecTransformer (46) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (45) + : :- ^ NoopFilter (2) + : : +- ^ 
Scan parquet (1) + : +- ^ InputIteratorTransformer (44) + : +- BroadcastQueryStage (42), Statistics(X) + : +- ColumnarBroadcastExchange (41) + : +- ^ ProjectExecTransformer (39) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (38) + : :- ^ InputIteratorTransformer (18) + : : +- BroadcastQueryStage (16), Statistics(X) + : : +- ColumnarBroadcastExchange (15) + : : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (13) + : : :- ^ NoopFilter (4) + : : : +- ^ Scan parquet (3) + : : +- ^ InputIteratorTransformer (12) + : : +- BroadcastQueryStage (10), Statistics(X) + : : +- ColumnarBroadcastExchange (9) + : : +- ^ ProjectExecTransformer (7) + : : +- ^ NoopFilter (6) + : : +- ^ Scan parquet (5) + : +- ^ FilterExecTransformer (37) + : +- ^ ProjectExecTransformer (36) + : +- ^ RegularHashAggregateExecTransformer (35) + : +- ^ InputIteratorTransformer (34) + : +- ShuffleQueryStage (32), Statistics(X) + : +- ColumnarExchange (31) + : +- VeloxAppendBatches (30) + : +- ^ ProjectExecTransformer (28) + : +- ^ FlushableHashAggregateExecTransformer (27) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (26) + : :- ^ ProjectExecTransformer (21) + : : +- ^ NoopFilter (20) + : : +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (25) + : +- BroadcastQueryStage (23), Statistics(X) + : +- ReusedExchange (22) + +- ^ InputIteratorTransformer (54) + +- BroadcastQueryStage (52), Statistics(X) + +- ColumnarBroadcastExchange (51) + +- ^ ProjectExecTransformer (49) + +- ^ NoopFilter (48) + +- ^ Scan parquet (47) +- == Initial Plan == - Sort (95) - +- Exchange (94) - +- Project (93) - +- BroadcastHashJoin Inner BuildRight (92) - :- Project (87) - : +- BroadcastHashJoin LeftSemi BuildRight (86) - : :- Filter (62) - : : +- Scan parquet (61) - : +- BroadcastExchange (85) - : +- Project (84) - : +- BroadcastHashJoin Inner BuildLeft (83) - : :- BroadcastExchange (70) - : : +- BroadcastHashJoin LeftSemi BuildRight (69) - : : :- Filter (64) - : : : +- Scan parquet (63) - : : +- BroadcastExchange (68) - : : +- Project (67) - : : +- Filter (66) - : : +- Scan parquet (65) - : +- Filter (82) - : +- HashAggregate (81) - : +- Exchange (80) - : +- HashAggregate (79) - : +- BroadcastHashJoin LeftSemi BuildRight (78) - : :- Project (73) - : : +- Filter (72) - : : +- Scan parquet (71) - : +- BroadcastExchange (77) - : +- Project (76) - : +- Filter (75) - : +- Scan parquet (74) - +- BroadcastExchange (91) - +- Project (90) - +- Filter (89) - +- Scan parquet (88) + Sort (97) + +- Exchange (96) + +- Project (95) + +- BroadcastHashJoin Inner BuildRight (94) + :- Project (89) + : +- BroadcastHashJoin LeftSemi BuildRight (88) + : :- Filter (64) + : : +- Scan parquet (63) + : +- BroadcastExchange (87) + : +- Project (86) + : +- BroadcastHashJoin Inner BuildLeft (85) + : :- BroadcastExchange (72) + : : +- BroadcastHashJoin LeftSemi BuildRight (71) + : : :- Filter (66) + : : : +- Scan parquet (65) + : : +- BroadcastExchange (70) + : : +- Project (69) + : : +- Filter (68) + : : +- Scan parquet (67) + : +- Filter (84) + : +- HashAggregate (83) + : +- Exchange (82) + : +- HashAggregate (81) + : +- BroadcastHashJoin LeftSemi BuildRight (80) + : :- Project (75) + : : +- Filter (74) + : : +- Scan parquet (73) + : +- BroadcastExchange (79) + : +- Project (78) + : +- Filter (77) + : +- Scan parquet (76) + +- BroadcastExchange (93) + +- Project (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet @@ -221,309 +217,317 @@ Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, 
l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(30) ColumnarExchange +(30) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(31) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(32) ShuffleQueryStage Output [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(32) InputAdapter +(33) InputAdapter Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(33) InputIteratorTransformer +(34) InputIteratorTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(34) RegularHashAggregateExecTransformer +(35) RegularHashAggregateExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(35) ProjectExecTransformer +(36) ProjectExecTransformer Output [3]: [(0.5 * sum(l_quantity#X)#X) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(36) FilterExecTransformer +(37) FilterExecTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: isnotnull((0.5 * sum(l_quantity))#X) -(37) BroadcastHashJoinExecTransformer +(38) BroadcastHashJoinExecTransformer Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join type: Inner Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(38) ProjectExecTransformer +(39) ProjectExecTransformer Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(39) WholeStageCodegenTransformer (X) +(40) WholeStageCodegenTransformer (X) Input [1]: [ps_suppkey#X] Arguments: false -(40) ColumnarBroadcastExchange +(41) ColumnarBroadcastExchange Input [1]: [ps_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(41) BroadcastQueryStage +(42) BroadcastQueryStage Output [1]: [ps_suppkey#X] Arguments: X -(42) InputAdapter +(43) InputAdapter Input [1]: [ps_suppkey#X] -(43) InputIteratorTransformer +(44) InputIteratorTransformer Input [1]: [ps_suppkey#X] -(44) BroadcastHashJoinExecTransformer +(45) BroadcastHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join type: LeftSemi Join condition: None -(45) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(46) Scan parquet +(47) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(47) NoopFilter +(48) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(48) ProjectExecTransformer +(49) ProjectExecTransformer Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(49) WholeStageCodegenTransformer (X) +(50) WholeStageCodegenTransformer (X) Input [1]: [n_nationkey#X] Arguments: false -(50) ColumnarBroadcastExchange +(51) ColumnarBroadcastExchange Input [1]: [n_nationkey#X] Arguments: 
HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(51) BroadcastQueryStage +(52) BroadcastQueryStage Output [1]: [n_nationkey#X] Arguments: X -(52) InputAdapter +(53) InputAdapter Input [1]: [n_nationkey#X] -(53) InputIteratorTransformer +(54) InputIteratorTransformer Input [1]: [n_nationkey#X] -(54) BroadcastHashJoinExecTransformer +(55) BroadcastHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(55) ProjectExecTransformer +(56) ProjectExecTransformer Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(56) WholeStageCodegenTransformer (X) +(57) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(57) ColumnarExchange +(58) VeloxAppendBatches +Input [2]: [s_name#X, s_address#X] +Arguments: X + +(59) ColumnarExchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(58) ShuffleQueryStage +(60) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] Arguments: X -(59) AQEShuffleRead +(61) AQEShuffleRead Input [2]: [s_name#X, s_address#X] Arguments: local -(60) VeloxColumnarToRowExec +(62) VeloxColumnarToRowExec Input [2]: [s_name#X, s_address#X] -(61) Scan parquet +(63) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_nationkey)] ReadSchema: struct -(62) Filter +(64) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Condition : isnotnull(s_nationkey#X) -(63) Scan parquet +(65) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(64) Filter +(66) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Condition : ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(65) Scan parquet +(67) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(66) Filter +(68) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(67) Project +(69) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(68) BroadcastExchange +(70) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(69) BroadcastHashJoin +(71) BroadcastHashJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join type: LeftSemi Join condition: None -(70) BroadcastExchange +(72) BroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(71) Scan parquet +(73) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(72) Filter +(74) Filter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X 
< 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(73) Project +(75) Project Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(74) Scan parquet +(76) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(75) Filter +(77) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(76) Project +(78) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(77) BroadcastExchange +(79) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(78) BroadcastHashJoin +(80) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: LeftSemi Join condition: None -(79) HashAggregate +(81) HashAggregate Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(80) Exchange +(82) Exchange Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(81) HashAggregate +(83) HashAggregate Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [(0.5 * sum(l_quantity#X)#X) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(82) Filter +(84) Filter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Condition : isnotnull((0.5 * sum(l_quantity))#X) -(83) BroadcastHashJoin +(85) BroadcastHashJoin Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join type: Inner Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(84) Project +(86) Project Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(85) BroadcastExchange +(87) BroadcastExchange Input [1]: [ps_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join type: LeftSemi Join condition: None -(87) Project +(89) Project Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(88) Scan parquet +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(89) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(90) Project +(92) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(91) BroadcastExchange +(93) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(92) BroadcastHashJoin +(94) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(93) Project +(95) Project Output [2]: [s_name#X, s_address#X] 
Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(94) Exchange +(96) Exchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(95) Sort +(97) Sort Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(96) AdaptiveSparkPlan +(98) AdaptiveSparkPlan Output [2]: [s_name#X, s_address#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/21.txt index d0b14137ca5a2..a7d9f42063a09 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/21.txt @@ -1,90 +1,85 @@ == Physical Plan == -AdaptiveSparkPlan (91) +AdaptiveSparkPlan (92) +- == Final Plan == - VeloxColumnarToRowExec (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) + VeloxColumnarToRowExec (59) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ InputIteratorTransformer (56) + +- ShuffleQueryStage (54), Statistics(X) + +- ColumnarExchange (53) + +- VeloxAppendBatches (52) +- ^ ProjectExecTransformer (50) +- ^ FlushableHashAggregateExecTransformer (49) +- ^ ProjectExecTransformer (48) - +- ^ BroadcastHashJoinExecTransformer Inner (47) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) :- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (37) : :- ^ ProjectExecTransformer (28) - : : +- ^ BroadcastHashJoinExecTransformer Inner (27) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (27) : : :- ^ InputIteratorTransformer (7) - : : : +- ^ InputAdapter (6) - : : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : : +- ColumnarBroadcastExchange (4) - : : : +- ^ NoopFilter (2) - : : : +- ^ Scan parquet (1) - : : +- ^ BroadcastHashJoinExecTransformer LeftAnti (26) - : : :- ^ BroadcastHashJoinExecTransformer LeftSemi (17) + : : : +- BroadcastQueryStage (5), Statistics(X) + : : : +- ColumnarBroadcastExchange (4) + : : : +- ^ NoopFilter (2) + : : : +- ^ Scan parquet (1) + : : +- ^ BroadcastHashJoinExecTransformer LeftAnti BuildRight (26) + : : :- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (17) : : : :- ^ ProjectExecTransformer (10) : : : : +- ^ NoopFilter (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (16) - : : : +- ^ InputAdapter (15) - : : : +- ^ BroadcastQueryStage (14), Statistics(X) - : : : +- ColumnarBroadcastExchange (13) - : : : +- ^ Scan parquet (11) + : : : +- BroadcastQueryStage (14), Statistics(X) + : : : +- ColumnarBroadcastExchange (13) + : : : +- ^ Scan parquet (11) : : +- ^ InputIteratorTransformer (25) - : : +- ^ InputAdapter (24) - : : +- ^ BroadcastQueryStage (23), Statistics(X) - : : +- ColumnarBroadcastExchange (22) - : : +- ^ ProjectExecTransformer (20) - : : +- ^ NoopFilter (19) - : : +- ^ Scan parquet (18) + : : +- BroadcastQueryStage (23), Statistics(X) + : : +- ColumnarBroadcastExchange (22) + : : +- ^ ProjectExecTransformer (20) + : : +- ^ NoopFilter (19) + : : +- ^ Scan parquet (18) : +- ^ InputIteratorTransformer (36) - : +- ^ InputAdapter (35) - : +- ^ BroadcastQueryStage (34), Statistics(X) - : +- ColumnarBroadcastExchange (33) - : +- ^ 
ProjectExecTransformer (31) - : +- ^ NoopFilter (30) - : +- ^ Scan parquet (29) + : +- BroadcastQueryStage (34), Statistics(X) + : +- ColumnarBroadcastExchange (33) + : +- ^ ProjectExecTransformer (31) + : +- ^ NoopFilter (30) + : +- ^ Scan parquet (29) +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ BroadcastQueryStage (44), Statistics(X) - +- ColumnarBroadcastExchange (43) - +- ^ ProjectExecTransformer (41) - +- ^ NoopFilter (40) - +- ^ Scan parquet (39) + +- BroadcastQueryStage (44), Statistics(X) + +- ColumnarBroadcastExchange (43) + +- ^ ProjectExecTransformer (41) + +- ^ NoopFilter (40) + +- ^ Scan parquet (39) +- == Initial Plan == - TakeOrderedAndProject (90) - +- HashAggregate (89) - +- Exchange (88) - +- HashAggregate (87) - +- Project (86) - +- BroadcastHashJoin Inner BuildRight (85) - :- Project (80) - : +- BroadcastHashJoin Inner BuildRight (79) - : :- Project (74) - : : +- BroadcastHashJoin Inner BuildLeft (73) - : : :- BroadcastExchange (61) - : : : +- Filter (60) - : : : +- Scan parquet (59) - : : +- BroadcastHashJoin LeftAnti BuildRight (72) - : : :- BroadcastHashJoin LeftSemi BuildRight (67) - : : : :- Project (64) - : : : : +- Filter (63) - : : : : +- Scan parquet (62) - : : : +- BroadcastExchange (66) - : : : +- Scan parquet (65) - : : +- BroadcastExchange (71) - : : +- Project (70) - : : +- Filter (69) - : : +- Scan parquet (68) - : +- BroadcastExchange (78) - : +- Project (77) - : +- Filter (76) - : +- Scan parquet (75) - +- BroadcastExchange (84) - +- Project (83) - +- Filter (82) - +- Scan parquet (81) + TakeOrderedAndProject (91) + +- HashAggregate (90) + +- Exchange (89) + +- HashAggregate (88) + +- Project (87) + +- BroadcastHashJoin Inner BuildRight (86) + :- Project (81) + : +- BroadcastHashJoin Inner BuildRight (80) + : :- Project (75) + : : +- BroadcastHashJoin Inner BuildLeft (74) + : : :- BroadcastExchange (62) + : : : +- Filter (61) + : : : +- Scan parquet (60) + : : +- BroadcastHashJoin LeftAnti BuildRight (73) + : : :- BroadcastHashJoin LeftSemi BuildRight (68) + : : : :- Project (65) + : : : : +- Filter (64) + : : : : +- Scan parquet (63) + : : : +- BroadcastExchange (67) + : : : +- Scan parquet (66) + : : +- BroadcastExchange (72) + : : +- Project (71) + : : +- Filter (70) + : : +- Scan parquet (69) + : +- BroadcastExchange (79) + : +- Project (78) + : +- Filter (77) + : +- Scan parquet (76) + +- BroadcastExchange (85) + +- Project (84) + +- Filter (83) + +- Scan parquet (82) (1) Scan parquet @@ -311,195 +306,199 @@ Input [2]: [s_name#X, count#X] Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: false -(52) ColumnarExchange +(52) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, count#X] +Arguments: X + +(53) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(54) ShuffleQueryStage Output [2]: [s_name#X, count#X] Arguments: X -(54) InputAdapter +(55) InputAdapter Input [2]: [s_name#X, count#X] -(55) InputIteratorTransformer +(56) InputIteratorTransformer Input [2]: [s_name#X, count#X] -(56) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(57) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, numwait#X] Arguments: false 
-(58) VeloxColumnarToRowExec +(59) VeloxColumnarToRowExec Input [2]: [s_name#X, numwait#X] -(59) Scan parquet +(60) Scan parquet Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(60) Filter +(61) Filter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(61) BroadcastExchange +(62) BroadcastExchange Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(62) Scan parquet +(63) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(63) Filter +(64) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(64) Project +(65) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(65) Scan parquet +(66) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(66) BroadcastExchange +(67) BroadcastExchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(67) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: NOT (l_suppkey#X = l_suppkey#X) -(68) Scan parquet +(69) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(69) Filter +(70) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(70) Project +(71) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(71) BroadcastExchange +(72) BroadcastExchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(72) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftAnti Join condition: NOT (l_suppkey#X = l_suppkey#X) -(73) BroadcastHashJoin +(74) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join type: Inner Join condition: None -(74) Project +(75) Project Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(75) Scan parquet +(76) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(76) Filter +(77) Filter Input [2]: [o_orderkey#X, o_orderstatus#X] Condition : ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(77) Project +(78) Project Output [1]: [o_orderkey#X] Input [2]: 
[o_orderkey#X, o_orderstatus#X] -(78) BroadcastExchange +(79) BroadcastExchange Input [1]: [o_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(79) BroadcastHashJoin +(80) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(80) Project +(81) Project Output [2]: [s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(81) Scan parquet +(82) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(82) Filter +(83) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(83) Project +(84) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(84) BroadcastExchange +(85) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(85) BroadcastHashJoin +(86) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(86) Project +(87) Project Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(87) HashAggregate +(88) HashAggregate Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(88) Exchange +(89) Exchange Input [2]: [s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(89) HashAggregate +(90) HashAggregate Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(90) TakeOrderedAndProject +(91) TakeOrderedAndProject Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X] -(91) AdaptiveSparkPlan +(92) AdaptiveSparkPlan Output [2]: [s_name#X, numwait#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/22.txt index 698879c473b6b..0d9139bcd990e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/22.txt @@ -1,40 +1,39 @@ == Physical Plan == -AdaptiveSparkPlan (38) +AdaptiveSparkPlan (40) +- == Final Plan == - VeloxColumnarToRowExec (26) - +- ^ SortExecTransformer (24) - +- ^ InputIteratorTransformer (23) - +- ^ InputAdapter (22) - +- ^ ShuffleQueryStage (21), Statistics(X) - +- ColumnarExchange (20) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ InputIteratorTransformer (17) - +- ^ InputAdapter (16) - +- ^ ShuffleQueryStage (15), Statistics(X) - +- ColumnarExchange (14) + VeloxColumnarToRowExec (28) + +- ^ SortExecTransformer (26) + +- ^ InputIteratorTransformer (25) + +- ShuffleQueryStage (23), Statistics(X) + +- ColumnarExchange (22) + +- VeloxAppendBatches (21) + +- ^ RegularHashAggregateExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) +- ^ ProjectExecTransformer (12) +- ^ FlushableHashAggregateExecTransformer (11) +- ^ 
ProjectExecTransformer (10) - +- ^ BroadcastHashJoinExecTransformer LeftAnti (9) + +- ^ BroadcastHashJoinExecTransformer LeftAnti BuildRight (9) :- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (8) - +- ^ InputAdapter (7) - +- ^ BroadcastQueryStage (6), Statistics(X) - +- ColumnarBroadcastExchange (5) - +- ^ Scan parquet (3) + +- BroadcastQueryStage (6), Statistics(X) + +- ColumnarBroadcastExchange (5) + +- ^ Scan parquet (3) +- == Initial Plan == - Sort (37) - +- Exchange (36) - +- HashAggregate (35) - +- Exchange (34) - +- HashAggregate (33) - +- Project (32) - +- BroadcastHashJoin LeftAnti BuildRight (31) - :- Filter (28) - : +- Scan parquet (27) - +- BroadcastExchange (30) - +- Scan parquet (29) + Sort (39) + +- Exchange (38) + +- HashAggregate (37) + +- Exchange (36) + +- HashAggregate (35) + +- Project (34) + +- BroadcastHashJoin LeftAnti BuildRight (33) + :- Filter (30) + : +- Scan parquet (29) + +- BroadcastExchange (32) + +- Scan parquet (31) (1) Scan parquet @@ -97,228 +96,240 @@ Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(19) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(20) ColumnarExchange +(21) VeloxAppendBatches +Input [3]: [cntrycode#X, numcust#X, totacctbal#X] +Arguments: X + +(22) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(23) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: X -(22) InputAdapter +(24) InputAdapter Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(23) InputIteratorTransformer +(25) InputIteratorTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(24) SortExecTransformer +(26) SortExecTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(25) WholeStageCodegenTransformer (X) +(27) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(26) VeloxColumnarToRowExec +(28) VeloxColumnarToRowExec Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(27) Scan parquet +(29) Scan parquet Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal)] ReadSchema: struct 
-(28) Filter +(30) Filter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) AND (cast(c_acctbal#X as decimal(16,6)) > Subquery subquery#X, [id=#X])) -(29) Scan parquet +(31) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(30) BroadcastExchange +(32) BroadcastExchange Input [1]: [o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(31) BroadcastHashJoin +(33) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: LeftAnti Join condition: None -(32) Project +(34) Project Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(33) HashAggregate +(35) HashAggregate Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(34) Exchange +(36) Exchange Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(35) HashAggregate +(37) HashAggregate Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(36) Exchange +(38) Exchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(37) Sort +(39) Sort Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(38) AdaptiveSparkPlan +(40) AdaptiveSparkPlan Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: isFinalPlan=true ===== Subqueries ===== Subquery:1 Hosting operator id = 1 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (57) +AdaptiveSparkPlan (60) +- == Final Plan == - VeloxColumnarToRowExec (50) - +- ^ RegularHashAggregateExecTransformer (48) - +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ ShuffleQueryStage (45), Statistics(X) - +- ColumnarExchange (44) - +- ^ FlushableHashAggregateExecTransformer (42) - +- ^ ProjectExecTransformer (41) - +- ^ NoopFilter (40) - +- ^ Scan parquet (39) + VeloxColumnarToRowExec (53) + +- ^ RegularHashAggregateExecTransformer (51) + +- ^ InputIteratorTransformer (50) + +- ShuffleQueryStage (48), Statistics(X) + +- ColumnarExchange (47) + +- VeloxAppendBatches (46) + +- ^ FlushableHashAggregateExecTransformer (44) + +- ^ ProjectExecTransformer (43) + +- ^ NoopFilter (42) + +- ^ Scan parquet (41) +- == Initial Plan == - HashAggregate (56) - +- Exchange (55) - +- HashAggregate (54) - +- Project (53) - +- Filter (52) - +- Scan parquet (51) + HashAggregate (59) + +- Exchange (58) + +- HashAggregate (57) + +- Project (56) + +- Filter (55) + +- Scan parquet (54) -(39) Scan parquet +(41) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(40) NoopFilter +(42) NoopFilter Input [2]: [c_phone#X, c_acctbal#X] Arguments: [c_phone#X, c_acctbal#X] -(41) ProjectExecTransformer +(43) ProjectExecTransformer Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] 
-(42) FlushableHashAggregateExecTransformer +(44) FlushableHashAggregateExecTransformer Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(43) WholeStageCodegenTransformer (X) +(45) WholeStageCodegenTransformer (X) Input [2]: [sum#X, count#X] Arguments: false -(44) ColumnarExchange +(46) VeloxAppendBatches +Input [2]: [sum#X, count#X] +Arguments: X + +(47) ColumnarExchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(45) ShuffleQueryStage +(48) ShuffleQueryStage Output [2]: [sum#X, count#X] Arguments: X -(46) InputAdapter +(49) InputAdapter Input [2]: [sum#X, count#X] -(47) InputIteratorTransformer +(50) InputIteratorTransformer Input [2]: [sum#X, count#X] -(48) RegularHashAggregateExecTransformer +(51) RegularHashAggregateExecTransformer Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(49) WholeStageCodegenTransformer (X) +(52) WholeStageCodegenTransformer (X) Input [1]: [avg(c_acctbal)#X] Arguments: false -(50) VeloxColumnarToRowExec +(53) VeloxColumnarToRowExec Input [1]: [avg(c_acctbal)#X] -(51) Scan parquet +(54) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(52) Filter +(55) Filter Input [2]: [c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND (c_acctbal#X > 0.00)) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) -(53) Project +(56) Project Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(54) HashAggregate +(57) HashAggregate Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(55) Exchange +(58) Exchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(56) HashAggregate +(59) HashAggregate Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(57) AdaptiveSparkPlan +(60) AdaptiveSparkPlan Output [1]: [avg(c_acctbal)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/3.txt index 98a7cd0e01457..988bdc6c26ca0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/3.txt @@ -1,55 +1,53 @@ == Physical Plan == -AdaptiveSparkPlan (53) +AdaptiveSparkPlan (54) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- TakeOrderedAndProjectExecTransformer (33) - +- ^ ProjectExecTransformer (31) - +- ^ RegularHashAggregateExecTransformer (30) - +- ^ InputIteratorTransformer (29) - +- ^ InputAdapter (28) - +- ^ ShuffleQueryStage (27), Statistics(X) - +- ColumnarExchange (26) + VeloxColumnarToRowExec (35) + +- TakeOrderedAndProjectExecTransformer (34) + +- ^ ProjectExecTransformer (32) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) +- ^ ProjectExecTransformer (24) +- ^ 
FlushableHashAggregateExecTransformer (23) +- ^ ProjectExecTransformer (22) - +- ^ BroadcastHashJoinExecTransformer Inner (21) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (21) :- ^ ProjectExecTransformer (12) - : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ BroadcastQueryStage (6), Statistics(X) - : : +- ColumnarBroadcastExchange (5) - : : +- ^ ProjectExecTransformer (3) - : : +- ^ NoopFilter (2) - : : +- ^ Scan parquet (1) + : : +- BroadcastQueryStage (6), Statistics(X) + : : +- ColumnarBroadcastExchange (5) + : : +- ^ ProjectExecTransformer (3) + : : +- ^ NoopFilter (2) + : : +- ^ Scan parquet (1) : +- ^ NoopFilter (10) : +- ^ Scan parquet (9) +- ^ InputIteratorTransformer (20) - +- ^ InputAdapter (19) - +- ^ BroadcastQueryStage (18), Statistics(X) - +- ColumnarBroadcastExchange (17) - +- ^ ProjectExecTransformer (15) - +- ^ NoopFilter (14) - +- ^ Scan parquet (13) + +- BroadcastQueryStage (18), Statistics(X) + +- ColumnarBroadcastExchange (17) + +- ^ ProjectExecTransformer (15) + +- ^ NoopFilter (14) + +- ^ Scan parquet (13) +- == Initial Plan == - TakeOrderedAndProject (52) - +- HashAggregate (51) - +- Exchange (50) - +- HashAggregate (49) - +- Project (48) - +- BroadcastHashJoin Inner BuildRight (47) - :- Project (42) - : +- BroadcastHashJoin Inner BuildLeft (41) - : :- BroadcastExchange (38) - : : +- Project (37) - : : +- Filter (36) - : : +- Scan parquet (35) - : +- Filter (40) - : +- Scan parquet (39) - +- BroadcastExchange (46) - +- Project (45) - +- Filter (44) - +- Scan parquet (43) + TakeOrderedAndProject (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- Project (49) + +- BroadcastHashJoin Inner BuildRight (48) + :- Project (43) + : +- BroadcastHashJoin Inner BuildLeft (42) + : :- BroadcastExchange (39) + : : +- Project (38) + : : +- Filter (37) + : : +- Scan parquet (36) + : +- Filter (41) + : +- Scan parquet (40) + +- BroadcastExchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -164,133 +162,137 @@ Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: false -(26) ColumnarExchange +(26) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] +Arguments: X + +(27) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(27) ShuffleQueryStage +(28) ShuffleQueryStage Output [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: X -(28) InputAdapter +(29) InputAdapter Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(29) InputIteratorTransformer +(30) InputIteratorTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(30) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: 
[sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(31) ProjectExecTransformer +(32) ProjectExecTransformer Output [4]: [l_orderkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] Input [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(32) WholeStageCodegenTransformer (X) +(33) WholeStageCodegenTransformer (X) Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: false -(33) TakeOrderedAndProjectExecTransformer +(34) TakeOrderedAndProjectExecTransformer Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X], 0 -(34) VeloxColumnarToRowExec +(35) VeloxColumnarToRowExec Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(35) Scan parquet +(36) Scan parquet Output [2]: [c_custkey#X, c_mktsegment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_mktsegment), EqualTo(c_mktsegment,BUILDING), IsNotNull(c_custkey)] ReadSchema: struct -(36) Filter +(37) Filter Input [2]: [c_custkey#X, c_mktsegment#X] Condition : ((isnotnull(c_mktsegment#X) AND (c_mktsegment#X = BUILDING)) AND isnotnull(c_custkey#X)) -(37) Project +(38) Project Output [1]: [c_custkey#X] Input [2]: [c_custkey#X, c_mktsegment#X] -(38) BroadcastExchange +(39) BroadcastExchange Input [1]: [c_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(39) Scan parquet +(40) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(40) Filter +(41) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Condition : (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(41) BroadcastHashJoin +(42) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(42) Project +(43) Project Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(43) Scan parquet +(44) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(44) Filter +(45) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(45) Project +(46) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(46) BroadcastExchange +(47) BroadcastExchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(47) BroadcastHashJoin +(48) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(48) Project +(49) Project Output 
[5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(49) HashAggregate +(50) HashAggregate Input [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(50) Exchange +(51) Exchange Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) HashAggregate +(52) HashAggregate Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [4]: [l_orderkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] -(52) TakeOrderedAndProject +(53) TakeOrderedAndProject Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(53) AdaptiveSparkPlan +(54) AdaptiveSparkPlan Output [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/4.txt index b5fefc6bef3ce..0b73608fbe4b7 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/4.txt @@ -1,46 +1,45 @@ == Physical Plan == -AdaptiveSparkPlan (44) +AdaptiveSparkPlan (46) +- == Final Plan == - VeloxColumnarToRowExec (29) - +- ^ SortExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24), Statistics(X) - +- ColumnarExchange (23) - +- ^ RegularHashAggregateExecTransformer (21) - +- ^ InputIteratorTransformer (20) - +- ^ InputAdapter (19) - +- ^ ShuffleQueryStage (18), Statistics(X) - +- ColumnarExchange (17) + VeloxColumnarToRowExec (31) + +- ^ SortExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ InputIteratorTransformer (21) + +- ShuffleQueryStage (19), Statistics(X) + +- ColumnarExchange (18) + +- VeloxAppendBatches (17) +- ^ ProjectExecTransformer (15) +- ^ FlushableHashAggregateExecTransformer (14) +- ^ ProjectExecTransformer (13) - +- ^ BroadcastHashJoinExecTransformer LeftSemi (12) + +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (12) :- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (11) - +- ^ InputAdapter (10) - +- ^ BroadcastQueryStage (9), Statistics(X) - +- ColumnarBroadcastExchange (8) - +- ^ ProjectExecTransformer (6) - +- ^ NoopFilter (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (9), Statistics(X) + +- ColumnarBroadcastExchange (8) + +- ^ ProjectExecTransformer 
(6) + +- ^ NoopFilter (5) + +- ^ Scan parquet (4) +- == Initial Plan == - Sort (43) - +- Exchange (42) - +- HashAggregate (41) - +- Exchange (40) - +- HashAggregate (39) - +- Project (38) - +- BroadcastHashJoin LeftSemi BuildRight (37) - :- Project (32) - : +- Filter (31) - : +- Scan parquet (30) - +- BroadcastExchange (36) - +- Project (35) - +- Filter (34) - +- Scan parquet (33) + Sort (45) + +- Exchange (44) + +- HashAggregate (43) + +- Exchange (42) + +- HashAggregate (41) + +- Project (40) + +- BroadcastHashJoin LeftSemi BuildRight (39) + :- Project (34) + : +- Filter (33) + : +- Scan parquet (32) + +- BroadcastExchange (38) + +- Project (37) + +- Filter (36) + +- Scan parquet (35) (1) Scan parquet @@ -116,126 +115,134 @@ Input [2]: [o_orderpriority#X, count#X] Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: false -(17) ColumnarExchange +(17) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] +Arguments: X + +(18) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] -(18) ShuffleQueryStage +(19) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] Arguments: X -(19) InputAdapter +(20) InputAdapter Input [2]: [o_orderpriority#X, count#X] -(20) InputIteratorTransformer +(21) InputIteratorTransformer Input [2]: [o_orderpriority#X, count#X] -(21) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [2]: [o_orderpriority#X, order_count#X] +Arguments: X + +(25) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] Arguments: X -(25) InputAdapter +(27) InputAdapter Input [2]: [o_orderpriority#X, order_count#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [2]: [o_orderpriority#X, order_count#X] -(27) SortExecTransformer +(29) SortExecTransformer Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(28) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(29) VeloxColumnarToRowExec +(31) VeloxColumnarToRowExec Input [2]: [o_orderpriority#X, order_count#X] -(30) Scan parquet +(32) Scan parquet Output [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-07-01), LessThan(o_orderdate,1993-10-01)] ReadSchema: struct -(31) Filter +(33) Filter Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Condition : ((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-07-01)) AND (o_orderdate#X < 1993-10-01)) -(32) Project +(34) Project Output [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] -(33) Scan parquet +(35) Scan parquet Output [3]: 
[l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(34) Filter +(36) Filter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(35) Project +(37) Project Output [1]: [l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(36) BroadcastExchange +(38) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(37) BroadcastHashJoin +(39) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(38) Project +(40) Project Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(39) HashAggregate +(41) HashAggregate Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(40) Exchange +(42) Exchange Input [2]: [o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) HashAggregate +(43) HashAggregate Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(42) Exchange +(44) Exchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) Sort +(45) Sort Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(44) AdaptiveSparkPlan +(46) AdaptiveSparkPlan Output [2]: [o_orderpriority#X, order_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/5.txt index be0403f587f51..8891188e8a08b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/5.txt @@ -1,98 +1,93 @@ == Physical Plan == -AdaptiveSparkPlan (100) +AdaptiveSparkPlan (102) +- == Final Plan == - VeloxColumnarToRowExec (65) - +- ^ SortExecTransformer (63) - +- ^ InputIteratorTransformer (62) - +- ^ InputAdapter (61) - +- ^ ShuffleQueryStage (60), Statistics(X) - +- ColumnarExchange (59) - +- ^ RegularHashAggregateExecTransformer (57) - +- ^ InputIteratorTransformer (56) - +- ^ InputAdapter (55) - +- ^ ShuffleQueryStage (54), Statistics(X) - +- ColumnarExchange (53) + VeloxColumnarToRowExec (67) + +- ^ SortExecTransformer (65) + +- ^ InputIteratorTransformer (64) + +- ShuffleQueryStage (62), Statistics(X) + +- ColumnarExchange (61) + +- VeloxAppendBatches (60) + +- ^ RegularHashAggregateExecTransformer (58) + +- ^ InputIteratorTransformer (57) + +- ShuffleQueryStage (55), Statistics(X) + +- ColumnarExchange (54) + +- VeloxAppendBatches (53) +- ^ ProjectExecTransformer (51) +- ^ FlushableHashAggregateExecTransformer (50) +- ^ ProjectExecTransformer (49) - +- ^ BroadcastHashJoinExecTransformer Inner (48) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (48) :- ^ ProjectExecTransformer (39) - : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : +- ^ 
BroadcastHashJoinExecTransformer Inner BuildRight (38) : :- ^ ProjectExecTransformer (30) - : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : :- ^ ProjectExecTransformer (21) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : :- ^ ProjectExecTransformer (12) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : :- ^ InputIteratorTransformer (7) - : : : : : +- ^ InputAdapter (6) - : : : : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (4) - : : : : : +- ^ NoopFilter (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (5), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (4) + : : : : : +- ^ NoopFilter (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ ProjectExecTransformer (10) : : : : +- ^ NoopFilter (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (19) - : : : +- ^ InputAdapter (18) - : : : +- ^ BroadcastQueryStage (17), Statistics(X) - : : : +- ColumnarBroadcastExchange (16) - : : : +- ^ NoopFilter (14) - : : : +- ^ Scan parquet (13) + : : : +- BroadcastQueryStage (17), Statistics(X) + : : : +- ColumnarBroadcastExchange (16) + : : : +- ^ NoopFilter (14) + : : : +- ^ Scan parquet (13) : : +- ^ InputIteratorTransformer (28) - : : +- ^ InputAdapter (27) - : : +- ^ BroadcastQueryStage (26), Statistics(X) - : : +- ColumnarBroadcastExchange (25) - : : +- ^ NoopFilter (23) - : : +- ^ Scan parquet (22) + : : +- BroadcastQueryStage (26), Statistics(X) + : : +- ColumnarBroadcastExchange (25) + : : +- ^ NoopFilter (23) + : : +- ^ Scan parquet (22) : +- ^ InputIteratorTransformer (37) - : +- ^ InputAdapter (36) - : +- ^ BroadcastQueryStage (35), Statistics(X) - : +- ColumnarBroadcastExchange (34) - : +- ^ NoopFilter (32) - : +- ^ Scan parquet (31) + : +- BroadcastQueryStage (35), Statistics(X) + : +- ColumnarBroadcastExchange (34) + : +- ^ NoopFilter (32) + : +- ^ Scan parquet (31) +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ BroadcastQueryStage (45), Statistics(X) - +- ColumnarBroadcastExchange (44) - +- ^ ProjectExecTransformer (42) - +- ^ NoopFilter (41) - +- ^ Scan parquet (40) + +- BroadcastQueryStage (45), Statistics(X) + +- ColumnarBroadcastExchange (44) + +- ^ ProjectExecTransformer (42) + +- ^ NoopFilter (41) + +- ^ Scan parquet (40) +- == Initial Plan == - Sort (99) - +- Exchange (98) - +- HashAggregate (97) - +- Exchange (96) - +- HashAggregate (95) - +- Project (94) - +- BroadcastHashJoin Inner BuildRight (93) - :- Project (88) - : +- BroadcastHashJoin Inner BuildRight (87) - : :- Project (83) - : : +- BroadcastHashJoin Inner BuildRight (82) - : : :- Project (78) - : : : +- BroadcastHashJoin Inner BuildRight (77) - : : : :- Project (73) - : : : : +- BroadcastHashJoin Inner BuildLeft (72) - : : : : :- BroadcastExchange (68) - : : : : : +- Filter (67) - : : : : : +- Scan parquet (66) - : : : : +- Project (71) - : : : : +- Filter (70) - : : : : +- Scan parquet (69) - : : : +- BroadcastExchange (76) - : : : +- Filter (75) - : : : +- Scan parquet (74) - : : +- BroadcastExchange (81) - : : +- Filter (80) - : : +- Scan parquet (79) - : +- BroadcastExchange (86) - : +- Filter (85) - : +- Scan parquet (84) - +- BroadcastExchange (92) - +- Project (91) - +- Filter (90) - +- Scan parquet (89) + Sort (101) + +- Exchange (100) + +- HashAggregate (99) + +- 
Exchange (98) + +- HashAggregate (97) + +- Project (96) + +- BroadcastHashJoin Inner BuildRight (95) + :- Project (90) + : +- BroadcastHashJoin Inner BuildRight (89) + : :- Project (85) + : : +- BroadcastHashJoin Inner BuildRight (84) + : : :- Project (80) + : : : +- BroadcastHashJoin Inner BuildRight (79) + : : : :- Project (75) + : : : : +- BroadcastHashJoin Inner BuildLeft (74) + : : : : :- BroadcastExchange (70) + : : : : : +- Filter (69) + : : : : : +- Scan parquet (68) + : : : : +- Project (73) + : : : : +- Filter (72) + : : : : +- Scan parquet (71) + : : : +- BroadcastExchange (78) + : : : +- Filter (77) + : : : +- Scan parquet (76) + : : +- BroadcastExchange (83) + : : +- Filter (82) + : : +- Scan parquet (81) + : +- BroadcastExchange (88) + : +- Filter (87) + : +- Scan parquet (86) + +- BroadcastExchange (94) + +- Project (93) + +- Filter (92) + +- Scan parquet (91) (1) Scan parquet @@ -324,226 +319,234 @@ Input [3]: [n_name#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: false -(53) ColumnarExchange +(53) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] +Arguments: X + +(54) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(54) ShuffleQueryStage +(55) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] Arguments: X -(55) InputAdapter +(56) InputAdapter Input [3]: [n_name#X, sum#X, isEmpty#X] -(56) InputIteratorTransformer +(57) InputIteratorTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] -(57) RegularHashAggregateExecTransformer +(58) RegularHashAggregateExecTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [n_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(58) WholeStageCodegenTransformer (X) +(59) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(59) ColumnarExchange +(60) VeloxAppendBatches +Input [2]: [n_name#X, revenue#X] +Arguments: X + +(61) ColumnarExchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(60) ShuffleQueryStage +(62) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] Arguments: X -(61) InputAdapter +(63) InputAdapter Input [2]: [n_name#X, revenue#X] -(62) InputIteratorTransformer +(64) InputIteratorTransformer Input [2]: [n_name#X, revenue#X] -(63) SortExecTransformer +(65) SortExecTransformer Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(64) WholeStageCodegenTransformer (X) +(66) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(65) VeloxColumnarToRowExec +(67) VeloxColumnarToRowExec Input [2]: [n_name#X, revenue#X] -(66) Scan parquet +(68) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(67) Filter +(69) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(68) BroadcastExchange +(70) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(69) Scan 
parquet +(71) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(71) Project +(73) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(72) BroadcastHashJoin +(74) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(73) Project +(75) Project Output [2]: [c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(74) Scan parquet +(76) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(75) Filter +(77) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(76) BroadcastExchange +(78) BroadcastExchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(77) BroadcastHashJoin +(79) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(78) Project +(80) Project Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(79) Scan parquet +(81) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(80) Filter +(82) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(81) BroadcastExchange +(83) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(82) BroadcastHashJoin +(84) BroadcastHashJoin Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join type: Inner Join condition: None -(83) Project +(85) Project Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(84) Scan parquet +(86) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(85) Filter +(87) Filter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(86) BroadcastExchange +(88) BroadcastExchange Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(87) BroadcastHashJoin +(89) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(88) Project +(90) 
Project Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(89) Scan parquet +(91) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(90) Filter +(92) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(91) Project +(93) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(92) BroadcastExchange +(94) BroadcastExchange Input [1]: [r_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(93) BroadcastHashJoin +(95) BroadcastHashJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join type: Inner Join condition: None -(94) Project +(96) Project Output [3]: [l_extendedprice#X, l_discount#X, n_name#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(95) HashAggregate +(97) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, n_name#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(96) Exchange +(98) Exchange Input [3]: [n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) HashAggregate +(99) HashAggregate Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [n_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(98) Exchange +(100) Exchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Sort +(101) Sort Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(100) AdaptiveSparkPlan +(102) AdaptiveSparkPlan Output [2]: [n_name#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/6.txt index 9333e2a8ad599..64624c791f726 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/6.txt @@ -1,23 +1,23 @@ == Physical Plan == -AdaptiveSparkPlan (19) +AdaptiveSparkPlan (20) +- == Final Plan == - VeloxColumnarToRowExec (12) - +- ^ RegularHashAggregateExecTransformer (10) - +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ ShuffleQueryStage (7), Statistics(X) - +- ColumnarExchange (6) + VeloxColumnarToRowExec (13) + +- ^ RegularHashAggregateExecTransformer (11) + +- ^ InputIteratorTransformer (10) + +- ShuffleQueryStage (8), Statistics(X) + +- ColumnarExchange (7) + +- VeloxAppendBatches (6) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ NoopFilter (2) +- ^ Scan parquet (1) +- == Initial Plan == - HashAggregate (18) - +- Exchange (17) - +- HashAggregate (16) - +- Project (15) - +- Filter (14) - +- Scan parquet (13) + HashAggregate (19) + +- Exchange (18) + +- HashAggregate (17) + +- Project (16) + +- Filter (15) + +- Scan 
parquet (14) (1) Scan parquet @@ -46,67 +46,71 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(6) ColumnarExchange +(6) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(7) ShuffleQueryStage +(8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(8) InputAdapter +(9) InputAdapter Input [2]: [sum#X, isEmpty#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(10) RegularHashAggregateExecTransformer +(11) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * l_discount#X))] Aggregate Attributes [1]: [sum((l_extendedprice#X * l_discount#X))#X] Results [1]: [sum((l_extendedprice#X * l_discount#X))#X AS revenue#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(12) VeloxColumnarToRowExec +(13) VeloxColumnarToRowExec Input [1]: [revenue#X] -(13) Scan parquet +(14) Scan parquet Output [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), IsNotNull(l_discount), IsNotNull(l_quantity), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), GreaterThanOrEqual(l_discount,0.05), LessThanOrEqual(l_discount,0.07), LessThan(l_quantity,24.00)] ReadSchema: struct -(14) Filter +(15) Filter Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((((((isnotnull(l_shipdate#X) AND isnotnull(l_discount#X)) AND isnotnull(l_quantity#X)) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND (l_discount#X >= 0.05)) AND (l_discount#X <= 0.07)) AND (l_quantity#X < 24.00)) -(15) Project +(16) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) HashAggregate +(17) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum((l_extendedprice#X * l_discount#X))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(17) Exchange +(18) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(18) HashAggregate +(19) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * l_discount#X))] Aggregate Attributes [1]: [sum((l_extendedprice#X * l_discount#X))#X] Results [1]: [sum((l_extendedprice#X * l_discount#X))#X AS revenue#X] -(19) AdaptiveSparkPlan +(20) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/7.txt index 2ac2968387fdf..ae1ab637f8050 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/7.txt @@ -1,92 +1,87 @@ == Physical Plan == -AdaptiveSparkPlan (93) +AdaptiveSparkPlan (95) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- ^ SortExecTransformer (58) - +- ^ InputIteratorTransformer (57) - +- ^ InputAdapter (56) - +- ^ ShuffleQueryStage (55), Statistics(X) - +- ColumnarExchange (54) - +- ^ RegularHashAggregateExecTransformer (52) - +- ^ 
InputIteratorTransformer (51) - +- ^ InputAdapter (50) - +- ^ ShuffleQueryStage (49), Statistics(X) - +- ColumnarExchange (48) + VeloxColumnarToRowExec (62) + +- ^ SortExecTransformer (60) + +- ^ InputIteratorTransformer (59) + +- ShuffleQueryStage (57), Statistics(X) + +- ColumnarExchange (56) + +- VeloxAppendBatches (55) + +- ^ RegularHashAggregateExecTransformer (53) + +- ^ InputIteratorTransformer (52) + +- ShuffleQueryStage (50), Statistics(X) + +- ColumnarExchange (49) + +- VeloxAppendBatches (48) +- ^ ProjectExecTransformer (46) +- ^ FlushableHashAggregateExecTransformer (45) +- ^ ProjectExecTransformer (44) - +- ^ BroadcastHashJoinExecTransformer Inner (43) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (43) :- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (37) : :- ^ ProjectExecTransformer (29) - : : +- ^ BroadcastHashJoinExecTransformer Inner (28) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (28) : : :- ^ ProjectExecTransformer (20) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (19) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (19) : : : :- ^ ProjectExecTransformer (11) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (10) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (10) : : : : :- ^ InputIteratorTransformer (7) - : : : : : +- ^ InputAdapter (6) - : : : : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (4) - : : : : : +- ^ NoopFilter (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (5), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (4) + : : : : : +- ^ NoopFilter (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ NoopFilter (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (18) - : : : +- ^ InputAdapter (17) - : : : +- ^ BroadcastQueryStage (16), Statistics(X) - : : : +- ColumnarBroadcastExchange (15) - : : : +- ^ NoopFilter (13) - : : : +- ^ Scan parquet (12) + : : : +- BroadcastQueryStage (16), Statistics(X) + : : : +- ColumnarBroadcastExchange (15) + : : : +- ^ NoopFilter (13) + : : : +- ^ Scan parquet (12) : : +- ^ InputIteratorTransformer (27) - : : +- ^ InputAdapter (26) - : : +- ^ BroadcastQueryStage (25), Statistics(X) - : : +- ColumnarBroadcastExchange (24) - : : +- ^ NoopFilter (22) - : : +- ^ Scan parquet (21) + : : +- BroadcastQueryStage (25), Statistics(X) + : : +- ColumnarBroadcastExchange (24) + : : +- ^ NoopFilter (22) + : : +- ^ Scan parquet (21) : +- ^ InputIteratorTransformer (36) - : +- ^ InputAdapter (35) - : +- ^ BroadcastQueryStage (34), Statistics(X) - : +- ColumnarBroadcastExchange (33) - : +- ^ NoopFilter (31) - : +- ^ Scan parquet (30) + : +- BroadcastQueryStage (34), Statistics(X) + : +- ColumnarBroadcastExchange (33) + : +- ^ NoopFilter (31) + : +- ^ Scan parquet (30) +- ^ InputIteratorTransformer (42) - +- ^ InputAdapter (41) - +- ^ BroadcastQueryStage (40), Statistics(X) - +- ReusedExchange (39) + +- BroadcastQueryStage (40), Statistics(X) + +- ReusedExchange (39) +- == Initial Plan == - Sort (92) - +- Exchange (91) - +- HashAggregate (90) - +- Exchange (89) - +- HashAggregate (88) - +- Project (87) - +- BroadcastHashJoin Inner BuildRight (86) - :- Project (82) - : +- BroadcastHashJoin Inner BuildRight (81) - : :- Project (77) - : : +- BroadcastHashJoin Inner BuildRight (76) - : : :- Project (72) - : : : +- BroadcastHashJoin Inner BuildRight (71) - : : : :- Project (67) - : : : : +- 
BroadcastHashJoin Inner BuildLeft (66) - : : : : :- BroadcastExchange (63) - : : : : : +- Filter (62) - : : : : : +- Scan parquet (61) - : : : : +- Filter (65) - : : : : +- Scan parquet (64) - : : : +- BroadcastExchange (70) - : : : +- Filter (69) - : : : +- Scan parquet (68) - : : +- BroadcastExchange (75) - : : +- Filter (74) - : : +- Scan parquet (73) - : +- BroadcastExchange (80) - : +- Filter (79) - : +- Scan parquet (78) - +- BroadcastExchange (85) - +- Filter (84) - +- Scan parquet (83) + Sort (94) + +- Exchange (93) + +- HashAggregate (92) + +- Exchange (91) + +- HashAggregate (90) + +- Project (89) + +- BroadcastHashJoin Inner BuildRight (88) + :- Project (84) + : +- BroadcastHashJoin Inner BuildRight (83) + : :- Project (79) + : : +- BroadcastHashJoin Inner BuildRight (78) + : : :- Project (74) + : : : +- BroadcastHashJoin Inner BuildRight (73) + : : : :- Project (69) + : : : : +- BroadcastHashJoin Inner BuildLeft (68) + : : : : :- BroadcastExchange (65) + : : : : : +- Filter (64) + : : : : : +- Scan parquet (63) + : : : : +- Filter (67) + : : : : +- Scan parquet (66) + : : : +- BroadcastExchange (72) + : : : +- Filter (71) + : : : +- Scan parquet (70) + : : +- BroadcastExchange (77) + : : +- Filter (76) + : : +- Scan parquet (75) + : +- BroadcastExchange (82) + : +- Filter (81) + : +- Scan parquet (80) + +- BroadcastExchange (87) + +- Filter (86) + +- Scan parquet (85) (1) Scan parquet @@ -294,218 +289,226 @@ Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: false -(48) ColumnarExchange +(48) VeloxAppendBatches +Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] +Arguments: X + +(49) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(49) ShuffleQueryStage +(50) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: X -(50) InputAdapter +(51) InputAdapter Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(51) InputIteratorTransformer +(52) InputIteratorTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(52) RegularHashAggregateExecTransformer +(53) RegularHashAggregateExecTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(53) WholeStageCodegenTransformer (X) +(54) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(54) ColumnarExchange +(55) VeloxAppendBatches +Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] +Arguments: X + +(56) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(55) ShuffleQueryStage +(57) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: X -(56) InputAdapter +(58) InputAdapter Input [4]: [supp_nation#X, cust_nation#X, l_year#X, 
revenue#X] -(57) InputIteratorTransformer +(59) InputIteratorTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(58) SortExecTransformer +(60) SortExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(59) WholeStageCodegenTransformer (X) +(61) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(60) VeloxColumnarToRowExec +(62) VeloxColumnarToRowExec Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(61) Scan parquet +(63) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(62) Filter +(64) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(63) BroadcastExchange +(65) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(64) Scan parquet +(66) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(65) Filter +(67) Filter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(66) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join type: Inner Join condition: None -(67) Project +(69) Project Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(68) Scan parquet +(70) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(69) Filter +(71) Filter Input [2]: [o_orderkey#X, o_custkey#X] Condition : (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(70) BroadcastExchange +(72) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(71) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(72) Project +(74) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(73) Scan parquet +(75) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(74) Filter +(76) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(75) BroadcastExchange +(77) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), 
[plan_id=X] -(76) BroadcastHashJoin +(78) BroadcastHashJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join type: Inner Join condition: None -(77) Project +(79) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(78) Scan parquet +(80) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(79) Filter +(81) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(80) BroadcastExchange +(82) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(81) BroadcastHashJoin +(83) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(82) Project +(84) Project Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(83) Scan parquet +(85) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,GERMANY),EqualTo(n_name,FRANCE))] ReadSchema: struct -(84) Filter +(86) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = GERMANY) OR (n_name#X = FRANCE))) -(85) BroadcastExchange +(87) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(87) Project +(89) Project Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, (l_extendedprice#X * (1 - l_discount#X)) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(88) HashAggregate +(90) HashAggregate Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(89) Exchange +(91) Exchange Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) HashAggregate +(92) HashAggregate Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(91) Exchange +(93) Exchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(92) Sort +(94) Sort Input [4]: [supp_nation#X, 
cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(93) AdaptiveSparkPlan +(95) AdaptiveSparkPlan Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/8.txt index 65906bb966915..4bbf7967744ca 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/8.txt @@ -1,125 +1,118 @@ == Physical Plan == -AdaptiveSparkPlan (129) +AdaptiveSparkPlan (131) +- == Final Plan == - VeloxColumnarToRowExec (84) - +- ^ SortExecTransformer (82) - +- ^ InputIteratorTransformer (81) - +- ^ InputAdapter (80) - +- ^ ShuffleQueryStage (79), Statistics(X) - +- ColumnarExchange (78) - +- ^ ProjectExecTransformer (76) - +- ^ RegularHashAggregateExecTransformer (75) - +- ^ InputIteratorTransformer (74) - +- ^ InputAdapter (73) - +- ^ ShuffleQueryStage (72), Statistics(X) - +- ColumnarExchange (71) + VeloxColumnarToRowExec (86) + +- ^ SortExecTransformer (84) + +- ^ InputIteratorTransformer (83) + +- ShuffleQueryStage (81), Statistics(X) + +- ColumnarExchange (80) + +- VeloxAppendBatches (79) + +- ^ ProjectExecTransformer (77) + +- ^ RegularHashAggregateExecTransformer (76) + +- ^ InputIteratorTransformer (75) + +- ShuffleQueryStage (73), Statistics(X) + +- ColumnarExchange (72) + +- VeloxAppendBatches (71) +- ^ ProjectExecTransformer (69) +- ^ FlushableHashAggregateExecTransformer (68) +- ^ ProjectExecTransformer (67) - +- ^ BroadcastHashJoinExecTransformer Inner (66) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (66) :- ^ ProjectExecTransformer (57) - : +- ^ BroadcastHashJoinExecTransformer Inner (56) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (56) : :- ^ ProjectExecTransformer (48) - : : +- ^ BroadcastHashJoinExecTransformer Inner (47) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) : : :- ^ ProjectExecTransformer (39) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : : : :- ^ ProjectExecTransformer (30) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : : : :- ^ ProjectExecTransformer (21) - : : : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : : : :- ^ ProjectExecTransformer (12) - : : : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : : : :- ^ InputIteratorTransformer (8) - : : : : : : : +- ^ InputAdapter (7) - : : : : : : : +- ^ BroadcastQueryStage (6), Statistics(X) - : : : : : : : +- ColumnarBroadcastExchange (5) - : : : : : : : +- ^ ProjectExecTransformer (3) - : : : : : : : +- ^ NoopFilter (2) - : : : : : : : +- ^ Scan parquet (1) + : : : : : : : +- BroadcastQueryStage (6), Statistics(X) + : : : : : : : +- ColumnarBroadcastExchange (5) + : : : : : : : +- ^ ProjectExecTransformer (3) + : : : : : : : +- ^ NoopFilter (2) + : : : : : : : +- ^ Scan parquet (1) : : : : : : +- ^ NoopFilter (10) : : : : : : +- ^ Scan parquet (9) : : : : : +- ^ InputIteratorTransformer (19) - : : : : : +- ^ InputAdapter (18) - : : : : : 
+- ^ BroadcastQueryStage (17), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (16) - : : : : : +- ^ NoopFilter (14) - : : : : : +- ^ Scan parquet (13) + : : : : : +- BroadcastQueryStage (17), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (16) + : : : : : +- ^ NoopFilter (14) + : : : : : +- ^ Scan parquet (13) : : : : +- ^ InputIteratorTransformer (28) - : : : : +- ^ InputAdapter (27) - : : : : +- ^ BroadcastQueryStage (26), Statistics(X) - : : : : +- ColumnarBroadcastExchange (25) - : : : : +- ^ NoopFilter (23) - : : : : +- ^ Scan parquet (22) + : : : : +- BroadcastQueryStage (26), Statistics(X) + : : : : +- ColumnarBroadcastExchange (25) + : : : : +- ^ NoopFilter (23) + : : : : +- ^ Scan parquet (22) : : : +- ^ InputIteratorTransformer (37) - : : : +- ^ InputAdapter (36) - : : : +- ^ BroadcastQueryStage (35), Statistics(X) - : : : +- ColumnarBroadcastExchange (34) - : : : +- ^ NoopFilter (32) - : : : +- ^ Scan parquet (31) + : : : +- BroadcastQueryStage (35), Statistics(X) + : : : +- ColumnarBroadcastExchange (34) + : : : +- ^ NoopFilter (32) + : : : +- ^ Scan parquet (31) : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ BroadcastQueryStage (44), Statistics(X) - : : +- ColumnarBroadcastExchange (43) - : : +- ^ NoopFilter (41) - : : +- ^ Scan parquet (40) + : : +- BroadcastQueryStage (44), Statistics(X) + : : +- ColumnarBroadcastExchange (43) + : : +- ^ NoopFilter (41) + : : +- ^ Scan parquet (40) : +- ^ InputIteratorTransformer (55) - : +- ^ InputAdapter (54) - : +- ^ BroadcastQueryStage (53), Statistics(X) - : +- ColumnarBroadcastExchange (52) - : +- ^ NoopFilter (50) - : +- ^ Scan parquet (49) + : +- BroadcastQueryStage (53), Statistics(X) + : +- ColumnarBroadcastExchange (52) + : +- ^ NoopFilter (50) + : +- ^ Scan parquet (49) +- ^ InputIteratorTransformer (65) - +- ^ InputAdapter (64) - +- ^ BroadcastQueryStage (63), Statistics(X) - +- ColumnarBroadcastExchange (62) - +- ^ ProjectExecTransformer (60) - +- ^ NoopFilter (59) - +- ^ Scan parquet (58) + +- BroadcastQueryStage (63), Statistics(X) + +- ColumnarBroadcastExchange (62) + +- ^ ProjectExecTransformer (60) + +- ^ NoopFilter (59) + +- ^ Scan parquet (58) +- == Initial Plan == - Sort (128) - +- Exchange (127) - +- HashAggregate (126) - +- Exchange (125) - +- HashAggregate (124) - +- Project (123) - +- BroadcastHashJoin Inner BuildRight (122) - :- Project (117) - : +- BroadcastHashJoin Inner BuildRight (116) - : :- Project (112) - : : +- BroadcastHashJoin Inner BuildRight (111) - : : :- Project (107) - : : : +- BroadcastHashJoin Inner BuildRight (106) - : : : :- Project (102) - : : : : +- BroadcastHashJoin Inner BuildRight (101) - : : : : :- Project (97) - : : : : : +- BroadcastHashJoin Inner BuildRight (96) - : : : : : :- Project (92) - : : : : : : +- BroadcastHashJoin Inner BuildLeft (91) - : : : : : : :- BroadcastExchange (88) - : : : : : : : +- Project (87) - : : : : : : : +- Filter (86) - : : : : : : : +- Scan parquet (85) - : : : : : : +- Filter (90) - : : : : : : +- Scan parquet (89) - : : : : : +- BroadcastExchange (95) - : : : : : +- Filter (94) - : : : : : +- Scan parquet (93) - : : : : +- BroadcastExchange (100) - : : : : +- Filter (99) - : : : : +- Scan parquet (98) - : : : +- BroadcastExchange (105) - : : : +- Filter (104) - : : : +- Scan parquet (103) - : : +- BroadcastExchange (110) - : : +- Filter (109) - : : +- Scan parquet (108) - : +- BroadcastExchange (115) - : +- Filter (114) - : +- Scan parquet (113) - +- BroadcastExchange (121) - +- Project (120) - +- Filter (119) 
- +- Scan parquet (118) + Sort (130) + +- Exchange (129) + +- HashAggregate (128) + +- Exchange (127) + +- HashAggregate (126) + +- Project (125) + +- BroadcastHashJoin Inner BuildRight (124) + :- Project (119) + : +- BroadcastHashJoin Inner BuildRight (118) + : :- Project (114) + : : +- BroadcastHashJoin Inner BuildRight (113) + : : :- Project (109) + : : : +- BroadcastHashJoin Inner BuildRight (108) + : : : :- Project (104) + : : : : +- BroadcastHashJoin Inner BuildRight (103) + : : : : :- Project (99) + : : : : : +- BroadcastHashJoin Inner BuildRight (98) + : : : : : :- Project (94) + : : : : : : +- BroadcastHashJoin Inner BuildLeft (93) + : : : : : : :- BroadcastExchange (90) + : : : : : : : +- Project (89) + : : : : : : : +- Filter (88) + : : : : : : : +- Scan parquet (87) + : : : : : : +- Filter (92) + : : : : : : +- Scan parquet (91) + : : : : : +- BroadcastExchange (97) + : : : : : +- Filter (96) + : : : : : +- Scan parquet (95) + : : : : +- BroadcastExchange (102) + : : : : +- Filter (101) + : : : : +- Scan parquet (100) + : : : +- BroadcastExchange (107) + : : : +- Filter (106) + : : : +- Scan parquet (105) + : : +- BroadcastExchange (112) + : : +- Filter (111) + : : +- Scan parquet (110) + : +- BroadcastExchange (117) + : +- Filter (116) + : +- Scan parquet (115) + +- BroadcastExchange (123) + +- Project (122) + +- Filter (121) + +- Scan parquet (120) (1) Scan parquet @@ -429,280 +422,288 @@ Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(71) ColumnarExchange +(71) VeloxAppendBatches +Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(72) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(72) ShuffleQueryStage +(73) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(73) InputAdapter +(74) InputAdapter Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(74) InputIteratorTransformer +(75) InputIteratorTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(75) RegularHashAggregateExecTransformer +(76) RegularHashAggregateExecTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(76) ProjectExecTransformer +(77) ProjectExecTransformer Output [2]: [o_year#X, (sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X / sum(volume#X)#X) AS mkt_share#X] Input [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(77) WholeStageCodegenTransformer (X) +(78) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(78) ColumnarExchange +(79) VeloxAppendBatches +Input [2]: [o_year#X, mkt_share#X] +Arguments: X + +(80) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(79) ShuffleQueryStage +(81) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] Arguments: X 
-(80) InputAdapter +(82) InputAdapter Input [2]: [o_year#X, mkt_share#X] -(81) InputIteratorTransformer +(83) InputIteratorTransformer Input [2]: [o_year#X, mkt_share#X] -(82) SortExecTransformer +(84) SortExecTransformer Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(83) WholeStageCodegenTransformer (X) +(85) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(84) VeloxColumnarToRowExec +(86) VeloxColumnarToRowExec Input [2]: [o_year#X, mkt_share#X] -(85) Scan parquet +(87) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_type), EqualTo(p_type,ECONOMY ANODIZED STEEL), IsNotNull(p_partkey)] ReadSchema: struct -(86) Filter +(88) Filter Input [2]: [p_partkey#X, p_type#X] Condition : ((isnotnull(p_type#X) AND (p_type#X = ECONOMY ANODIZED STEEL)) AND isnotnull(p_partkey#X)) -(87) Project +(89) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_type#X] -(88) BroadcastExchange +(90) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(89) Scan parquet +(91) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(90) Filter +(92) Filter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(91) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: None -(92) Project +(94) Project Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(93) Scan parquet +(95) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(94) Filter +(96) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(95) BroadcastExchange +(97) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(96) BroadcastHashJoin +(98) BroadcastHashJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(97) Project +(99) Project Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(98) Scan parquet +(100) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(99) Filter +(101) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(100) BroadcastExchange +(102) BroadcastExchange Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: 
HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(101) BroadcastHashJoin +(103) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(102) Project +(104) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(103) Scan parquet +(105) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(104) Filter +(106) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(105) BroadcastExchange +(107) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(106) BroadcastHashJoin +(108) BroadcastHashJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join type: Inner Join condition: None -(107) Project +(109) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(108) Scan parquet +(110) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(109) Filter +(111) Filter Input [2]: [n_nationkey#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(110) BroadcastExchange +(112) BroadcastExchange Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(111) BroadcastHashJoin +(113) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(112) Project +(114) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(113) Scan parquet +(115) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(114) Filter +(116) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(115) BroadcastExchange +(117) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(116) BroadcastHashJoin +(118) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(117) Project +(119) Project Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(118) Scan parquet +(120) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(119) Filter +(121) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(120) Project +(122) Project Output 
[1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(121) BroadcastExchange +(123) BroadcastExchange Input [1]: [r_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(122) BroadcastHashJoin +(124) BroadcastHashJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join type: Inner Join condition: None -(123) Project +(125) Project Output [3]: [year(o_orderdate#X) AS o_year#X, (l_extendedprice#X * (1 - l_discount#X)) AS volume#X, n_name#X AS nation#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(124) HashAggregate +(126) HashAggregate Input [3]: [o_year#X, volume#X, nation#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(125) Exchange +(127) Exchange Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(126) HashAggregate +(128) HashAggregate Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [2]: [o_year#X, (sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X / sum(volume#X)#X) AS mkt_share#X] -(127) Exchange +(129) Exchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(128) Sort +(130) Sort Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(129) AdaptiveSparkPlan +(131) AdaptiveSparkPlan Output [2]: [o_year#X, mkt_share#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/9.txt index e7abd01744e53..301c001eb7938 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/9.txt @@ -1,96 +1,91 @@ == Physical Plan == -AdaptiveSparkPlan (98) +AdaptiveSparkPlan (100) +- == Final Plan == - VeloxColumnarToRowExec (64) - +- ^ SortExecTransformer (62) - +- ^ InputIteratorTransformer (61) - +- ^ InputAdapter (60) - +- ^ ShuffleQueryStage (59), Statistics(X) - +- ColumnarExchange (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) + VeloxColumnarToRowExec (66) + +- ^ SortExecTransformer (64) + +- ^ InputIteratorTransformer (63) + +- ShuffleQueryStage (61), Statistics(X) + +- ColumnarExchange (60) + +- VeloxAppendBatches (59) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ InputIteratorTransformer (56) + +- ShuffleQueryStage (54), Statistics(X) + +- ColumnarExchange (53) + +- VeloxAppendBatches (52) +- ^ ProjectExecTransformer (50) +- ^ FlushableHashAggregateExecTransformer (49) +- ^ ProjectExecTransformer (48) - +- ^ BroadcastHashJoinExecTransformer Inner (47) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) :- ^ ProjectExecTransformer (39) - : +- ^ BroadcastHashJoinExecTransformer Inner 
(38) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : :- ^ ProjectExecTransformer (30) - : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : :- ^ ProjectExecTransformer (21) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : :- ^ ProjectExecTransformer (12) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ BroadcastQueryStage (6), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (5) - : : : : : +- ^ ProjectExecTransformer (3) - : : : : : +- ^ NoopFilter (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (6), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (5) + : : : : : +- ^ ProjectExecTransformer (3) + : : : : : +- ^ NoopFilter (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ NoopFilter (10) : : : : +- ^ Scan parquet (9) : : : +- ^ InputIteratorTransformer (19) - : : : +- ^ InputAdapter (18) - : : : +- ^ BroadcastQueryStage (17), Statistics(X) - : : : +- ColumnarBroadcastExchange (16) - : : : +- ^ NoopFilter (14) - : : : +- ^ Scan parquet (13) + : : : +- BroadcastQueryStage (17), Statistics(X) + : : : +- ColumnarBroadcastExchange (16) + : : : +- ^ NoopFilter (14) + : : : +- ^ Scan parquet (13) : : +- ^ InputIteratorTransformer (28) - : : +- ^ InputAdapter (27) - : : +- ^ BroadcastQueryStage (26), Statistics(X) - : : +- ColumnarBroadcastExchange (25) - : : +- ^ NoopFilter (23) - : : +- ^ Scan parquet (22) + : : +- BroadcastQueryStage (26), Statistics(X) + : : +- ColumnarBroadcastExchange (25) + : : +- ^ NoopFilter (23) + : : +- ^ Scan parquet (22) : +- ^ InputIteratorTransformer (37) - : +- ^ InputAdapter (36) - : +- ^ BroadcastQueryStage (35), Statistics(X) - : +- ColumnarBroadcastExchange (34) - : +- ^ NoopFilter (32) - : +- ^ Scan parquet (31) + : +- BroadcastQueryStage (35), Statistics(X) + : +- ColumnarBroadcastExchange (34) + : +- ^ NoopFilter (32) + : +- ^ Scan parquet (31) +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ BroadcastQueryStage (44), Statistics(X) - +- ColumnarBroadcastExchange (43) - +- ^ NoopFilter (41) - +- ^ Scan parquet (40) + +- BroadcastQueryStage (44), Statistics(X) + +- ColumnarBroadcastExchange (43) + +- ^ NoopFilter (41) + +- ^ Scan parquet (40) +- == Initial Plan == - Sort (97) - +- Exchange (96) - +- HashAggregate (95) - +- Exchange (94) - +- HashAggregate (93) - +- Project (92) - +- BroadcastHashJoin Inner BuildRight (91) - :- Project (87) - : +- BroadcastHashJoin Inner BuildRight (86) - : :- Project (82) - : : +- BroadcastHashJoin Inner BuildRight (81) - : : :- Project (77) - : : : +- BroadcastHashJoin Inner BuildRight (76) - : : : :- Project (72) - : : : : +- BroadcastHashJoin Inner BuildLeft (71) - : : : : :- BroadcastExchange (68) - : : : : : +- Project (67) - : : : : : +- Filter (66) - : : : : : +- Scan parquet (65) - : : : : +- Filter (70) - : : : : +- Scan parquet (69) - : : : +- BroadcastExchange (75) - : : : +- Filter (74) - : : : +- Scan parquet (73) - : : +- BroadcastExchange (80) - : : +- Filter (79) - : : +- Scan parquet (78) - : +- BroadcastExchange (85) - : +- Filter (84) - : +- Scan parquet (83) - +- BroadcastExchange (90) - +- Filter (89) - +- Scan parquet (88) + Sort (99) + +- Exchange (98) + +- HashAggregate (97) + +- Exchange (96) + +- 
HashAggregate (95) + +- Project (94) + +- BroadcastHashJoin Inner BuildRight (93) + :- Project (89) + : +- BroadcastHashJoin Inner BuildRight (88) + : :- Project (84) + : : +- BroadcastHashJoin Inner BuildRight (83) + : : :- Project (79) + : : : +- BroadcastHashJoin Inner BuildRight (78) + : : : :- Project (74) + : : : : +- BroadcastHashJoin Inner BuildLeft (73) + : : : : :- BroadcastExchange (70) + : : : : : +- Project (69) + : : : : : +- Filter (68) + : : : : : +- Scan parquet (67) + : : : : +- Filter (72) + : : : : +- Scan parquet (71) + : : : +- BroadcastExchange (77) + : : : +- Filter (76) + : : : +- Scan parquet (75) + : : +- BroadcastExchange (82) + : : +- Filter (81) + : : +- Scan parquet (80) + : +- BroadcastExchange (87) + : +- Filter (86) + : +- Scan parquet (85) + +- BroadcastExchange (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet @@ -318,222 +313,230 @@ Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: false -(52) ColumnarExchange +(52) VeloxAppendBatches +Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] +Arguments: X + +(53) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(54) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: X -(54) InputAdapter +(55) InputAdapter Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(55) InputIteratorTransformer +(56) InputIteratorTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(56) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(57) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(58) ColumnarExchange +(59) VeloxAppendBatches +Input [3]: [nation#X, o_year#X, sum_profit#X] +Arguments: X + +(60) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(61) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: X -(60) InputAdapter +(62) InputAdapter Input [3]: [nation#X, o_year#X, sum_profit#X] -(61) InputIteratorTransformer +(63) InputIteratorTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] -(62) SortExecTransformer +(64) SortExecTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(63) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(64) VeloxColumnarToRowExec +(66) VeloxColumnarToRowExec Input [3]: [nation#X, o_year#X, sum_profit#X] -(65) Scan parquet +(67) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringContains(p_name,green), IsNotNull(p_partkey)] ReadSchema: struct -(66) Filter +(68) Filter Input [2]: [p_partkey#X, p_name#X] Condition : ((isnotnull(p_name#X) AND Contains(p_name#X, green)) AND 
isnotnull(p_partkey#X)) -(67) Project +(69) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(68) BroadcastExchange +(70) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(69) Scan parquet +(71) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(71) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: None -(72) Project +(74) Project Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(73) Scan parquet +(75) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(74) Filter +(76) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(75) BroadcastExchange +(77) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(76) BroadcastHashJoin +(78) BroadcastHashJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(77) Project +(79) Project Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(78) Scan parquet +(80) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(79) Filter +(81) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Condition : (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(80) BroadcastExchange +(82) BroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, false], input[0, bigint, false]),false), [plan_id=X] -(81) BroadcastHashJoin +(83) BroadcastHashJoin Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join type: Inner Join condition: None -(82) Project +(84) Project Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(83) Scan parquet +(85) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(84) Filter +(86) Filter Input [2]: [o_orderkey#X, o_orderdate#X] Condition : isnotnull(o_orderkey#X) -(85) BroadcastExchange +(87) BroadcastExchange Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: 
HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(87) Project +(89) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(88) Scan parquet +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(89) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(90) BroadcastExchange +(92) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(91) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(92) Project +(94) Project Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, ((l_extendedprice#X * (1 - l_discount#X)) - (ps_supplycost#X * l_quantity#X)) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(93) HashAggregate +(95) HashAggregate Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(94) Exchange +(96) Exchange Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(95) HashAggregate +(97) HashAggregate Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(96) Exchange +(98) Exchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) Sort +(99) Sort Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(98) AdaptiveSparkPlan +(100) AdaptiveSparkPlan Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/1.txt index 7a80ed95f3e8d..63b7d317f3cf2 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/1.txt @@ -1,31 +1,31 @@ == Physical Plan == -AdaptiveSparkPlan (28) +AdaptiveSparkPlan (30) +- == Final Plan == - VeloxColumnarToRowExec (19) - +- ^ SortExecTransformer (17) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (21) + +- ^ SortExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- 
VeloxAppendBatches (14) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ ProjectExecTransformer (5) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ FilterExecTransformer (2) +- ^ Scan parquet (1) +- == Initial Plan == - Sort (27) - +- Exchange (26) - +- HashAggregate (25) - +- Exchange (24) - +- HashAggregate (23) - +- Project (22) - +- Filter (21) - +- Scan parquet (20) + Sort (29) + +- Exchange (28) + +- HashAggregate (27) + +- Exchange (26) + +- HashAggregate (25) + +- Project (24) + +- Filter (23) + +- Scan parquet (22) (1) Scan parquet @@ -58,97 +58,105 @@ Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] +Arguments: X + +(8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true)), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] 
Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] +Arguments: X + +(15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(17) SortExecTransformer +(19) SortExecTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(18) 
WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(19) VeloxColumnarToRowExec +(21) VeloxColumnarToRowExec Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(20) Scan parquet +(22) Scan parquet Output [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), LessThanOrEqual(l_shipdate,1998-09-02)] ReadSchema: struct -(21) Filter +(23) Filter Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Condition : (isnotnull(l_shipdate#X) AND (l_shipdate#X <= 1998-09-02)) -(22) Project +(24) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] -(23) HashAggregate +(25) HashAggregate Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [partial_sum(l_quantity#X), partial_sum(l_extendedprice#X), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)), partial_sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true)), partial_avg(l_quantity#X), partial_avg(l_extendedprice#X), partial_avg(l_discount#X), partial_count(1)] Aggregate Attributes [15]: [sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Results [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(24) Exchange +(26) Exchange Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(25) HashAggregate +(27) HashAggregate Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) 
* promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true)), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(26) Exchange +(28) Exchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Sort +(29) Sort Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(28) AdaptiveSparkPlan +(30) AdaptiveSparkPlan Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/10.txt index 935650e260592..db7c1cb796676 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/10.txt @@ -1,68 +1,65 @@ == Physical Plan == -AdaptiveSparkPlan (67) +AdaptiveSparkPlan (68) +- == Final Plan == - VeloxColumnarToRowExec (43) - +- 
TakeOrderedAndProjectExecTransformer (42) - +- ^ ProjectExecTransformer (40) - +- ^ RegularHashAggregateExecTransformer (39) - +- ^ InputIteratorTransformer (38) - +- ^ InputAdapter (37) - +- ^ ShuffleQueryStage (36) - +- ColumnarExchange (35) + VeloxColumnarToRowExec (44) + +- TakeOrderedAndProjectExecTransformer (43) + +- ^ ProjectExecTransformer (41) + +- ^ RegularHashAggregateExecTransformer (40) + +- ^ InputIteratorTransformer (39) + +- ShuffleQueryStage (37) + +- ColumnarExchange (36) + +- VeloxAppendBatches (35) +- ^ ProjectExecTransformer (33) +- ^ FlushableHashAggregateExecTransformer (32) +- ^ ProjectExecTransformer (31) - +- ^ BroadcastHashJoinExecTransformer Inner (30) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (30) :- ^ ProjectExecTransformer (22) - : +- ^ BroadcastHashJoinExecTransformer Inner (21) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (21) : :- ^ ProjectExecTransformer (12) - : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) : : :- ^ FilterExecTransformer (2) : : : +- ^ Scan parquet (1) : : +- ^ InputIteratorTransformer (10) - : : +- ^ InputAdapter (9) - : : +- ^ BroadcastQueryStage (8) - : : +- ColumnarBroadcastExchange (7) - : : +- ^ ProjectExecTransformer (5) - : : +- ^ FilterExecTransformer (4) - : : +- ^ Scan parquet (3) + : : +- BroadcastQueryStage (8) + : : +- ColumnarBroadcastExchange (7) + : : +- ^ ProjectExecTransformer (5) + : : +- ^ FilterExecTransformer (4) + : : +- ^ Scan parquet (3) : +- ^ InputIteratorTransformer (20) - : +- ^ InputAdapter (19) - : +- ^ BroadcastQueryStage (18) - : +- ColumnarBroadcastExchange (17) - : +- ^ ProjectExecTransformer (15) - : +- ^ FilterExecTransformer (14) - : +- ^ Scan parquet (13) + : +- BroadcastQueryStage (18) + : +- ColumnarBroadcastExchange (17) + : +- ^ ProjectExecTransformer (15) + : +- ^ FilterExecTransformer (14) + : +- ^ Scan parquet (13) +- ^ InputIteratorTransformer (29) - +- ^ InputAdapter (28) - +- ^ BroadcastQueryStage (27) - +- ColumnarBroadcastExchange (26) - +- ^ FilterExecTransformer (24) - +- ^ Scan parquet (23) + +- BroadcastQueryStage (27) + +- ColumnarBroadcastExchange (26) + +- ^ FilterExecTransformer (24) + +- ^ Scan parquet (23) +- == Initial Plan == - TakeOrderedAndProject (66) - +- HashAggregate (65) - +- Exchange (64) - +- HashAggregate (63) - +- Project (62) - +- BroadcastHashJoin Inner BuildRight (61) - :- Project (57) - : +- BroadcastHashJoin Inner BuildRight (56) - : :- Project (51) - : : +- BroadcastHashJoin Inner BuildRight (50) - : : :- Filter (45) - : : : +- Scan parquet (44) - : : +- BroadcastExchange (49) - : : +- Project (48) - : : +- Filter (47) - : : +- Scan parquet (46) - : +- BroadcastExchange (55) - : +- Project (54) - : +- Filter (53) - : +- Scan parquet (52) - +- BroadcastExchange (60) - +- Filter (59) - +- Scan parquet (58) + TakeOrderedAndProject (67) + +- HashAggregate (66) + +- Exchange (65) + +- HashAggregate (64) + +- Project (63) + +- BroadcastHashJoin Inner BuildRight (62) + :- Project (58) + : +- BroadcastHashJoin Inner BuildRight (57) + : :- Project (52) + : : +- BroadcastHashJoin Inner BuildRight (51) + : : :- Filter (46) + : : : +- Scan parquet (45) + : : +- BroadcastExchange (50) + : : +- Project (49) + : : +- Filter (48) + : : +- Scan parquet (47) + : +- BroadcastExchange (56) + : +- Project (55) + : +- Filter (54) + : +- Scan parquet (53) + +- BroadcastExchange (61) + +- Filter (60) + +- Scan parquet (59) (1) Scan parquet @@ -213,155 +210,159 @@ Input [9]: 
[c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: false -(35) ColumnarExchange +(35) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] +Arguments: X + +(36) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(37) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: X -(37) InputAdapter +(38) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(38) InputIteratorTransformer +(39) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(39) RegularHashAggregateExecTransformer +(40) RegularHashAggregateExecTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(40) ProjectExecTransformer +(41) ProjectExecTransformer Output [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Input [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(41) WholeStageCodegenTransformer (X) +(42) WholeStageCodegenTransformer (X) Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: false -(42) TakeOrderedAndProjectExecTransformer +(43) TakeOrderedAndProjectExecTransformer Input [8]: [c_custkey#X, c_name#X, revenue#X, 
c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X], 0 -(43) VeloxColumnarToRowExec +(44) VeloxColumnarToRowExec Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(44) Scan parquet +(45) Scan parquet Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(45) Filter +(46) Filter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(46) Scan parquet +(47) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(47) Filter +(48) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(48) Project +(49) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(49) BroadcastExchange +(50) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(50) BroadcastHashJoin +(51) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(51) Project +(52) Project Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(52) Scan parquet +(53) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(53) Filter +(54) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Condition : ((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(54) Project +(55) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(55) BroadcastExchange +(56) BroadcastExchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(56) BroadcastHashJoin +(57) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(57) Project +(58) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(58) Scan parquet +(59) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(59) 
Filter +(60) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(60) BroadcastExchange +(61) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(61) BroadcastHashJoin +(62) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(62) Project +(63) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(63) HashAggregate +(64) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(64) Exchange +(65) Exchange Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(65) HashAggregate +(66) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(66) TakeOrderedAndProject +(67) TakeOrderedAndProject Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(67) AdaptiveSparkPlan +(68) AdaptiveSparkPlan Output [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/11.txt index 1cc9e9ac278ea..33c7971d37492 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/11.txt @@ -1,59 +1,57 @@ == Physical Plan == -AdaptiveSparkPlan (58) +AdaptiveSparkPlan (60) +- == Final Plan == - VeloxColumnarToRowExec (38) - +- ^ SortExecTransformer (36) - +- ^ InputIteratorTransformer (35) - +- ^ InputAdapter (34) - +- ^ ShuffleQueryStage (33) - +- ColumnarExchange (32) - +- ^ FilterExecTransformer (30) - +- ^ RegularHashAggregateExecTransformer (29) - +- ^ InputIteratorTransformer (28) - +- ^ InputAdapter (27) - +- ^ ShuffleQueryStage (26) - +- ColumnarExchange (25) + VeloxColumnarToRowExec (40) + +- ^ SortExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ FilterExecTransformer (31) + +- ^ RegularHashAggregateExecTransformer (30) + +- ^ InputIteratorTransformer (29) + +- ShuffleQueryStage (27) + +- ColumnarExchange (26) + +- VeloxAppendBatches (25) +- ^ ProjectExecTransformer (23) +- ^ FlushableHashAggregateExecTransformer (22) +- ^ ProjectExecTransformer (21) - +- ^ BroadcastHashJoinExecTransformer Inner (20) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) :- ^ ProjectExecTransformer (11) - : +- ^ BroadcastHashJoinExecTransformer Inner (10) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (10) : :- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) : +- ^ InputIteratorTransformer (9) - : +- ^ InputAdapter (8) - : +- ^ BroadcastQueryStage (7) - : +- ColumnarBroadcastExchange (6) - : +- ^ FilterExecTransformer (4) - : +- ^ Scan parquet (3) + : +- BroadcastQueryStage (7) + : +- ColumnarBroadcastExchange (6) + : +- ^ FilterExecTransformer (4) + : +- ^ Scan parquet (3) +- ^ InputIteratorTransformer (19) - +- ^ InputAdapter (18) - +- ^ BroadcastQueryStage (17) - +- ColumnarBroadcastExchange (16) - +- ^ ProjectExecTransformer (14) - +- ^ FilterExecTransformer (13) - +- ^ Scan parquet (12) + +- BroadcastQueryStage (17) + +- ColumnarBroadcastExchange (16) + +- ^ ProjectExecTransformer (14) + +- ^ FilterExecTransformer (13) + +- ^ Scan parquet (12) +- == Initial Plan == - Sort (57) - +- Exchange (56) - +- Filter (55) - +- HashAggregate (54) - +- Exchange (53) - +- HashAggregate (52) - +- Project (51) - +- BroadcastHashJoin Inner BuildRight (50) - :- Project (45) - : +- BroadcastHashJoin Inner BuildRight (44) - : :- Filter (40) - : : +- Scan parquet (39) - : +- BroadcastExchange (43) - : +- Filter (42) - : +- Scan parquet (41) - +- BroadcastExchange (49) - +- Project (48) - +- Filter (47) - +- Scan parquet (46) + Sort (59) + +- Exchange (58) + +- Filter (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- Project (53) + +- BroadcastHashJoin Inner BuildRight (52) + :- Project (47) + : +- BroadcastHashJoin Inner BuildRight (46) + : :- Filter (42) + : : +- Scan parquet (41) + : +- BroadcastExchange (45) + : +- Filter (44) + : +- Scan parquet (43) + +- BroadcastExchange (51) + +- Project (50) + +- Filter (49) + +- Scan parquet (48) (1) Scan parquet @@ -162,153 +160,161 @@ Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: false -(25) ColumnarExchange +(25) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] +Arguments: X + +(26) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, 
[ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(26) ShuffleQueryStage +(27) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: X -(27) InputAdapter +(28) InputAdapter Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(28) InputIteratorTransformer +(29) InputIteratorTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(29) RegularHashAggregateExecTransformer +(30) RegularHashAggregateExecTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X AS value#X] -(30) FilterExecTransformer +(31) FilterExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(31) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(32) ColumnarExchange +(33) VeloxAppendBatches +Input [2]: [ps_partkey#X, value#X] +Arguments: X + +(34) ColumnarExchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(33) ShuffleQueryStage +(35) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] Arguments: X -(34) InputAdapter +(36) InputAdapter Input [2]: [ps_partkey#X, value#X] -(35) InputIteratorTransformer +(37) InputIteratorTransformer Input [2]: [ps_partkey#X, value#X] -(36) SortExecTransformer +(38) SortExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(37) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(38) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [2]: [ps_partkey#X, value#X] -(39) Scan parquet +(41) Scan parquet Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(40) Filter +(42) Filter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(41) Scan parquet +(43) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(42) Filter +(44) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(43) BroadcastExchange +(45) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(44) BroadcastHashJoin +(46) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(45) Project +(47) Project Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(46) Scan parquet +(48) Scan parquet Output [2]: [n_nationkey#X, n_name#X] 
Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(47) Filter +(49) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(48) Project +(50) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(49) BroadcastExchange +(51) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(50) BroadcastHashJoin +(52) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(51) Project +(53) Project Output [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(52) HashAggregate +(54) HashAggregate Input [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(53) Exchange +(55) Exchange Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) HashAggregate +(56) HashAggregate Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X AS value#X] -(55) Filter +(57) Filter Input [2]: [ps_partkey#X, value#X] Condition : (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(56) Exchange +(58) Exchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(57) Sort +(59) Sort Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(58) AdaptiveSparkPlan +(60) AdaptiveSparkPlan Output [2]: [ps_partkey#X, value#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/12.txt index 1d477cbd774e4..f1f3a9234354e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/12.txt @@ -1,44 +1,43 @@ == Physical Plan == -AdaptiveSparkPlan (42) +AdaptiveSparkPlan (44) +- == Final Plan == - VeloxColumnarToRowExec (28) - +- ^ SortExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23) - +- ColumnarExchange (22) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ InputIteratorTransformer (19) - +- ^ InputAdapter (18) - +- ^ ShuffleQueryStage (17) - +- ColumnarExchange (16) + VeloxColumnarToRowExec (30) + +- ^ SortExecTransformer (28) + 
+- ^ InputIteratorTransformer (27) + +- ShuffleQueryStage (25) + +- ColumnarExchange (24) + +- VeloxAppendBatches (23) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ InputIteratorTransformer (20) + +- ShuffleQueryStage (18) + +- ColumnarExchange (17) + +- VeloxAppendBatches (16) +- ^ ProjectExecTransformer (14) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ BroadcastQueryStage (5) - : +- ColumnarBroadcastExchange (4) - : +- ^ FilterExecTransformer (2) - : +- ^ Scan parquet (1) + : +- BroadcastQueryStage (5) + : +- ColumnarBroadcastExchange (4) + : +- ^ FilterExecTransformer (2) + : +- ^ Scan parquet (1) +- ^ ProjectExecTransformer (10) +- ^ FilterExecTransformer (9) +- ^ Scan parquet (8) +- == Initial Plan == - Sort (41) - +- Exchange (40) - +- HashAggregate (39) - +- Exchange (38) - +- HashAggregate (37) - +- Project (36) - +- BroadcastHashJoin Inner BuildLeft (35) - :- BroadcastExchange (31) - : +- Filter (30) - : +- Scan parquet (29) - +- Project (34) - +- Filter (33) - +- Scan parquet (32) + Sort (43) + +- Exchange (42) + +- HashAggregate (41) + +- Exchange (40) + +- HashAggregate (39) + +- Project (38) + +- BroadcastHashJoin Inner BuildLeft (37) + :- BroadcastExchange (33) + : +- Filter (32) + : +- Scan parquet (31) + +- Project (36) + +- Filter (35) + +- Scan parquet (34) (1) Scan parquet @@ -109,121 +108,129 @@ Input [3]: [l_shipmode#X, sum#X, sum#X] Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: false -(16) ColumnarExchange +(16) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] +Arguments: X + +(17) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] -(17) ShuffleQueryStage +(18) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] Arguments: X -(18) InputAdapter +(19) InputAdapter Input [3]: [l_shipmode#X, sum#X, sum#X] -(19) InputIteratorTransformer +(20) InputIteratorTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] -(20) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(22) ColumnarExchange +(23) VeloxAppendBatches +Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] +Arguments: X + +(24) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, 
low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(25) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: X -(24) InputAdapter +(26) InputAdapter Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(25) InputIteratorTransformer +(27) InputIteratorTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(26) SortExecTransformer +(28) SortExecTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(27) WholeStageCodegenTransformer (X) +(29) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(28) VeloxColumnarToRowExec +(30) VeloxColumnarToRowExec Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(29) Scan parquet +(31) Scan parquet Output [2]: [o_orderkey#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(30) Filter +(32) Filter Input [2]: [o_orderkey#X, o_orderpriority#X] Condition : isnotnull(o_orderkey#X) -(31) BroadcastExchange +(33) BroadcastExchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(32) Scan parquet +(34) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(33) Filter +(35) Filter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Condition : ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(34) Project +(36) Project Output [2]: [l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(35) BroadcastHashJoin +(37) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(36) Project +(38) Project Output [2]: [o_orderpriority#X, l_shipmode#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(37) HashAggregate +(39) HashAggregate Input [2]: [o_orderpriority#X, l_shipmode#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), partial_sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(38) Exchange +(40) Exchange Input [3]: [l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) HashAggregate +(41) HashAggregate Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X 
= 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(40) Exchange +(42) Exchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Sort +(43) Sort Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(42) AdaptiveSparkPlan +(44) AdaptiveSparkPlan Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/13.txt index 492cfafb1ec18..13ef25b681c0d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/13.txt @@ -1,53 +1,52 @@ == Physical Plan == -AdaptiveSparkPlan (52) +AdaptiveSparkPlan (55) +- == Final Plan == - VeloxColumnarToRowExec (36) - +- ^ SortExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31) - +- ColumnarExchange (30) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (39) + +- ^ SortExecTransformer (37) + +- ^ InputIteratorTransformer (36) + +- ShuffleQueryStage (34) + +- ColumnarExchange (33) + +- VeloxAppendBatches (32) + +- ^ RegularHashAggregateExecTransformer (30) + +- ^ InputIteratorTransformer (29) + +- ShuffleQueryStage (27) + +- ColumnarExchange (26) + +- VeloxAppendBatches (25) + +- ^ ProjectExecTransformer (23) + +- ^ FlushableHashAggregateExecTransformer (22) + +- ^ ProjectExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ ProjectExecTransformer (13) +- ^ FlushableHashAggregateExecTransformer (12) +- ^ ProjectExecTransformer (11) - +- ^ BroadcastHashJoinExecTransformer LeftOuter (10) + +- ^ BroadcastHashJoinExecTransformer LeftOuter BuildRight (10) :- ^ Scan parquet (1) +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ BroadcastQueryStage (7) - +- ColumnarBroadcastExchange (6) - +- ^ ProjectExecTransformer (4) - +- ^ FilterExecTransformer (3) - +- ^ Scan parquet (2) + +- BroadcastQueryStage (7) + +- ColumnarBroadcastExchange (6) + +- ^ ProjectExecTransformer (4) + +- ^ FilterExecTransformer (3) + +- ^ Scan parquet (2) +- == Initial Plan == - Sort (51) - +- Exchange (50) - +- HashAggregate (49) - +- Exchange (48) - +- HashAggregate (47) - +- 
HashAggregate (46) - +- Exchange (45) - +- HashAggregate (44) - +- Project (43) - +- BroadcastHashJoin LeftOuter BuildRight (42) - :- Scan parquet (37) - +- BroadcastExchange (41) - +- Project (40) - +- Filter (39) - +- Scan parquet (38) + Sort (54) + +- Exchange (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- HashAggregate (49) + +- Exchange (48) + +- HashAggregate (47) + +- Project (46) + +- BroadcastHashJoin LeftOuter BuildRight (45) + :- Scan parquet (40) + +- BroadcastExchange (44) + +- Project (43) + +- Filter (42) + +- Scan parquet (41) (1) Scan parquet @@ -113,174 +112,186 @@ Input [2]: [c_custkey#X, count#X] Input [3]: [hash_partition_key#X, c_custkey#X, count#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, count#X] +Arguments: X + +(16) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, count#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [2]: [c_custkey#X, count#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [2]: [c_custkey#X, count#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [2]: [c_custkey#X, count#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [2]: [c_custkey#X, count(o_orderkey#X)#X] -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [count(o_orderkey#X)#X AS c_count#X] Input [2]: [c_custkey#X, count(o_orderkey#X)#X] -(21) FlushableHashAggregateExecTransformer +(22) FlushableHashAggregateExecTransformer Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(22) ProjectExecTransformer +(23) ProjectExecTransformer Output [3]: [hash(c_count#X, 42) AS hash_partition_key#X, c_count#X, count#X] Input [2]: [c_count#X, count#X] -(23) WholeStageCodegenTransformer (X) +(24) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: false -(24) ColumnarExchange +(25) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_count#X, count#X] +Arguments: X + +(26) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(27) ShuffleQueryStage Output [2]: [c_count#X, count#X] Arguments: X -(26) InputAdapter +(28) InputAdapter Input [2]: [c_count#X, count#X] -(27) InputIteratorTransformer +(29) InputIteratorTransformer Input [2]: [c_count#X, count#X] -(28) RegularHashAggregateExecTransformer +(30) RegularHashAggregateExecTransformer Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(29) WholeStageCodegenTransformer (X) +(31) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(30) ColumnarExchange +(32) VeloxAppendBatches +Input [2]: [c_count#X, custdist#X] +Arguments: X + +(33) ColumnarExchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(34) 
ShuffleQueryStage Output [2]: [c_count#X, custdist#X] Arguments: X -(32) InputAdapter +(35) InputAdapter Input [2]: [c_count#X, custdist#X] -(33) InputIteratorTransformer +(36) InputIteratorTransformer Input [2]: [c_count#X, custdist#X] -(34) SortExecTransformer +(37) SortExecTransformer Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(35) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(36) VeloxColumnarToRowExec +(39) VeloxColumnarToRowExec Input [2]: [c_count#X, custdist#X] -(37) Scan parquet +(40) Scan parquet Output [1]: [c_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Scan parquet +(41) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(39) Filter +(42) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Condition : ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(40) Project +(43) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(41) BroadcastExchange +(44) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(42) BroadcastHashJoin +(45) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(43) Project +(46) Project Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(44) HashAggregate +(47) HashAggregate Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(45) Exchange +(48) Exchange Input [2]: [c_custkey#X, count#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(46) HashAggregate +(49) HashAggregate Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [1]: [count(o_orderkey#X)#X AS c_count#X] -(47) HashAggregate +(50) HashAggregate Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(48) Exchange +(51) Exchange Input [2]: [c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) HashAggregate +(52) HashAggregate Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(50) Exchange +(53) Exchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Sort +(54) Sort Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(52) AdaptiveSparkPlan +(55) AdaptiveSparkPlan Output [2]: [c_count#X, custdist#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/14.txt index b564240ff5520..c86f2215cda5b 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/14.txt @@ -1,37 +1,36 @@ == Physical Plan == -AdaptiveSparkPlan (34) +AdaptiveSparkPlan (35) +- == Final Plan == - VeloxColumnarToRowExec (22) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (23) + +- ^ ProjectExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) :- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ BroadcastQueryStage (8) - +- ColumnarBroadcastExchange (7) - +- ^ FilterExecTransformer (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (8) + +- ColumnarBroadcastExchange (7) + +- ^ FilterExecTransformer (5) + +- ^ Scan parquet (4) +- == Initial Plan == - HashAggregate (33) - +- Exchange (32) - +- HashAggregate (31) - +- Project (30) - +- BroadcastHashJoin Inner BuildRight (29) - :- Project (25) - : +- Filter (24) - : +- Scan parquet (23) - +- BroadcastExchange (28) - +- Filter (27) - +- Scan parquet (26) + HashAggregate (34) + +- Exchange (33) + +- HashAggregate (32) + +- Project (31) + +- BroadcastHashJoin Inner BuildRight (30) + :- Project (26) + : +- Filter (25) + : +- Scan parquet (24) + +- BroadcastExchange (29) + +- Filter (28) + +- Scan parquet (27) (1) Scan parquet @@ -98,95 +97,99 @@ Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(16) ColumnarExchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 
0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X)), DecimalType(38,6), true)) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X as decimal(38,6)))), DecimalType(38,6), true) AS promo_revenue#X] Input [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [1]: [promo_revenue#X] Arguments: false -(22) VeloxColumnarToRowExec +(23) VeloxColumnarToRowExec Input [1]: [promo_revenue#X] -(23) Scan parquet +(24) Scan parquet Output [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-09-01), LessThan(l_shipdate,1995-10-01), IsNotNull(l_partkey)] ReadSchema: struct -(24) Filter +(25) Filter Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-09-01)) AND (l_shipdate#X < 1995-10-01)) AND isnotnull(l_partkey#X)) -(25) Project +(26) Project Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(26) Scan parquet +(27) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(27) Filter +(28) Filter Input [2]: [p_partkey#X, p_type#X] Condition : isnotnull(p_partkey#X) -(28) BroadcastExchange +(29) BroadcastExchange Input [2]: [p_partkey#X, p_type#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(29) BroadcastHashJoin +(30) BroadcastHashJoin Left keys [1]: 
[l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(30) Project +(31) Project Output [3]: [l_extendedprice#X, l_discount#X, p_type#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(31) HashAggregate +(32) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, p_type#X] Keys: [] Functions [2]: [partial_sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(32) Exchange +(33) Exchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(33) HashAggregate +(34) HashAggregate Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X)), DecimalType(38,6), true)) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X as decimal(38,6)))), DecimalType(38,6), true) AS promo_revenue#X] -(34) AdaptiveSparkPlan +(35) AdaptiveSparkPlan Output [1]: [promo_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/15.txt index ff632e930f729..8edb179e592fc 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/15.txt +++ 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/15.txt @@ -1,46 +1,45 @@ == Physical Plan == -AdaptiveSparkPlan (44) +AdaptiveSparkPlan (46) +- == Final Plan == - VeloxColumnarToRowExec (29) - +- ^ SortExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24) - +- ColumnarExchange (23) - +- ^ ProjectExecTransformer (21) - +- ^ BroadcastHashJoinExecTransformer Inner (20) + VeloxColumnarToRowExec (31) + +- ^ SortExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (21) :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ BroadcastQueryStage (5) - : +- ColumnarBroadcastExchange (4) - : +- ^ FilterExecTransformer (2) - : +- ^ Scan parquet (1) - +- ^ FilterExecTransformer (19) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ InputIteratorTransformer (17) - +- ^ InputAdapter (16) - +- ^ ShuffleQueryStage (15) - +- ColumnarExchange (14) + : +- BroadcastQueryStage (5) + : +- ColumnarBroadcastExchange (4) + : +- ^ FilterExecTransformer (2) + : +- ^ Scan parquet (1) + +- ^ FilterExecTransformer (20) + +- ^ RegularHashAggregateExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) +- ^ ProjectExecTransformer (12) +- ^ FlushableHashAggregateExecTransformer (11) +- ^ ProjectExecTransformer (10) +- ^ FilterExecTransformer (9) +- ^ Scan parquet (8) +- == Initial Plan == - Sort (43) - +- Exchange (42) - +- Project (41) - +- BroadcastHashJoin Inner BuildLeft (40) - :- BroadcastExchange (32) - : +- Filter (31) - : +- Scan parquet (30) - +- Filter (39) - +- HashAggregate (38) - +- Exchange (37) - +- HashAggregate (36) - +- Project (35) - +- Filter (34) - +- Scan parquet (33) + Sort (45) + +- Exchange (44) + +- Project (43) + +- BroadcastHashJoin Inner BuildLeft (42) + :- BroadcastExchange (34) + : +- Filter (33) + : +- Scan parquet (32) + +- Filter (41) + +- HashAggregate (40) + +- Exchange (39) + +- HashAggregate (38) + +- Project (37) + +- Filter (36) + +- Scan parquet (35) (1) Scan parquet @@ -102,138 +101,146 @@ Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as 
decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS total_revenue#X] -(19) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [supplier_no#X, total_revenue#X] Arguments: (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(20) BroadcastHashJoinExecTransformer +(21) BroadcastHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(21) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: X -(25) InputAdapter +(27) InputAdapter Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(27) SortExecTransformer +(29) SortExecTransformer Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(28) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(29) VeloxColumnarToRowExec +(31) VeloxColumnarToRowExec Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(30) Scan parquet +(32) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey)] ReadSchema: struct -(31) Filter +(33) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Condition : isnotnull(s_suppkey#X) -(32) BroadcastExchange +(34) BroadcastExchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(33) Scan parquet +(35) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(34) Filter +(36) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(35) Project +(37) Project Output [3]: [l_suppkey#X, 
l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(36) HashAggregate +(38) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(37) Exchange +(39) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(38) HashAggregate +(40) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS total_revenue#X] -(39) Filter +(41) Filter Input [2]: [supplier_no#X, total_revenue#X] Condition : (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(40) BroadcastHashJoin +(42) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(41) Project +(43) Project Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(42) Exchange +(44) Exchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) Sort +(45) Sort Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(44) AdaptiveSparkPlan +(46) AdaptiveSparkPlan Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/16.txt index 06fcb53dfebef..4d4d52c7d2e61 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/16.txt @@ -1,57 +1,56 @@ == Physical Plan == -AdaptiveSparkPlan (56) +AdaptiveSparkPlan (59) +- == Final Plan == - VeloxColumnarToRowExec (35) - +- ^ SortExecTransformer (33) - +- ^ InputIteratorTransformer (32) - +- ^ InputAdapter (31) - +- ^ ShuffleQueryStage (30) - +- ColumnarExchange (29) - +- ^ RegularHashAggregateExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24) - +- ColumnarExchange (23) - +- ^ ProjectExecTransformer (21) - +- ^ 
FlushableHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ ProjectExecTransformer (13) +- ^ FlushableHashAggregateExecTransformer (12) +- ^ ProjectExecTransformer (11) - +- ^ BroadcastHashJoinExecTransformer Inner (10) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (10) :- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ BroadcastQueryStage (7) - +- ColumnarBroadcastExchange (6) - +- ^ FilterExecTransformer (4) - +- ^ Scan parquet (3) + +- BroadcastQueryStage (7) + +- ColumnarBroadcastExchange (6) + +- ^ FilterExecTransformer (4) + +- ^ Scan parquet (3) +- == Initial Plan == - Sort (55) - +- Exchange (54) - +- HashAggregate (53) - +- Exchange (52) - +- HashAggregate (51) - +- HashAggregate (50) - +- Exchange (49) - +- HashAggregate (48) - +- Project (47) - +- BroadcastHashJoin Inner BuildRight (46) - :- BroadcastHashJoin LeftAnti BuildRight (42) - : :- Filter (37) - : : +- Scan parquet (36) - : +- BroadcastExchange (41) - : +- Project (40) - : +- Filter (39) - : +- Scan parquet (38) - +- BroadcastExchange (45) - +- Filter (44) - +- Scan parquet (43) + Sort (58) + +- Exchange (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- HashAggregate (53) + +- Exchange (52) + +- HashAggregate (51) + +- Project (50) + +- BroadcastHashJoin Inner BuildRight (49) + :- BroadcastHashJoin LeftAnti BuildRight (45) + : :- Filter (40) + : : +- Scan parquet (39) + : +- BroadcastExchange (44) + : +- Project (43) + : +- Filter (42) + : +- Scan parquet (41) + +- BroadcastExchange (48) + +- Filter (47) + +- Scan parquet (46) (1) Scan parquet @@ -118,195 +117,207 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] +Arguments: X + +(16) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate 
Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(20) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(21) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: X -(25) InputAdapter +(27) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(27) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(28) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(29) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(30) ShuffleQueryStage +(33) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: X -(31) InputAdapter +(34) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(32) InputIteratorTransformer +(35) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(33) SortExecTransformer +(36) SortExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(34) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(35) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(36) Scan parquet +(39) Scan parquet Output [2]: [ps_partkey#X, ps_suppkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_partkey)] ReadSchema: struct -(37) Filter +(40) Filter Input [2]: [ps_partkey#X, ps_suppkey#X] Condition : 
isnotnull(ps_partkey#X) -(38) Scan parquet +(41) Scan parquet Output [2]: [s_suppkey#X, s_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_comment)] ReadSchema: struct -(39) Filter +(42) Filter Input [2]: [s_suppkey#X, s_comment#X] Condition : (isnotnull(s_comment#X) AND s_comment#X LIKE %Customer%Complaints%) -(40) Project +(43) Project Output [1]: [s_suppkey#X] Input [2]: [s_suppkey#X, s_comment#X] -(41) BroadcastExchange +(44) BroadcastExchange Input [1]: [s_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),true), [plan_id=X] -(42) BroadcastHashJoin +(45) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(43) Scan parquet +(46) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(44) Filter +(47) Filter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Condition : (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(45) BroadcastExchange +(48) BroadcastExchange Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(46) BroadcastHashJoin +(49) BroadcastHashJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(47) Project +(50) Project Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(48) HashAggregate +(51) HashAggregate Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(49) Exchange +(52) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(50) HashAggregate +(53) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(51) HashAggregate +(54) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(52) Exchange +(55) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(53) HashAggregate +(56) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(54) Exchange +(57) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS 
FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(55) Sort +(58) Sort Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(56) AdaptiveSparkPlan +(59) AdaptiveSparkPlan Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/17.txt index d806c8c2577b5..7c5359849d4e8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/17.txt @@ -1,39 +1,38 @@ == Physical Plan == -AdaptiveSparkPlan (35) +AdaptiveSparkPlan (36) +- == Final Plan == - VeloxColumnarToRowExec (14) - +- ^ ProjectExecTransformer (12) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (15) + +- ^ ProjectExecTransformer (13) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ FlushableHashAggregateExecTransformer (5) +- ^ InputIteratorTransformer (4) - +- ^ InputAdapter (3) - +- ^ RowToVeloxColumnar (2) - +- ^ LocalTableScan (1) + +- RowToVeloxColumnar (2) + +- LocalTableScan (1) +- == Initial Plan == - HashAggregate (34) - +- Exchange (33) - +- HashAggregate (32) - +- Project (31) - +- BroadcastHashJoin Inner BuildRight (30) - :- Project (22) - : +- BroadcastHashJoin Inner BuildRight (21) - : :- Filter (16) - : : +- Scan parquet (15) - : +- BroadcastExchange (20) - : +- Project (19) - : +- Filter (18) - : +- Scan parquet (17) - +- BroadcastExchange (29) - +- Filter (28) - +- HashAggregate (27) - +- Exchange (26) - +- HashAggregate (25) - +- Filter (24) - +- Scan parquet (23) + HashAggregate (35) + +- Exchange (34) + +- HashAggregate (33) + +- Project (32) + +- BroadcastHashJoin Inner BuildRight (31) + :- Project (23) + : +- BroadcastHashJoin Inner BuildRight (22) + : :- Filter (17) + : : +- Scan parquet (16) + : +- BroadcastExchange (21) + : +- Project (20) + : +- Filter (19) + : +- Scan parquet (18) + +- BroadcastExchange (30) + +- Filter (29) + +- HashAggregate (28) + +- Exchange (27) + +- HashAggregate (26) + +- Filter (25) + +- Scan parquet (24) (1) LocalTableScan @@ -60,141 +59,145 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(8) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [2]: [sum#X, isEmpty#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [sum(l_extendedprice#X)#X] -(12) ProjectExecTransformer +(13) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) 
/ 7.00), DecimalType(27,6), true) AS avg_yearly#X] Input [1]: [sum(l_extendedprice#X)#X] -(13) WholeStageCodegenTransformer (X) +(14) WholeStageCodegenTransformer (X) Input [1]: [avg_yearly#X] Arguments: false -(14) VeloxColumnarToRowExec +(15) VeloxColumnarToRowExec Input [1]: [avg_yearly#X] -(15) Scan parquet +(16) Scan parquet Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_quantity)] ReadSchema: struct -(16) Filter +(17) Filter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Condition : (isnotnull(l_partkey#X) AND isnotnull(l_quantity#X)) -(17) Scan parquet +(18) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(18) Filter +(19) Filter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Condition : ((((isnotnull(p_brand#X) AND isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(19) Project +(20) Project Output [1]: [p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(20) BroadcastExchange +(21) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(21) BroadcastHashJoin +(22) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(22) Project +(23) Project Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(23) Scan parquet +(24) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(24) Filter +(25) Filter Input [2]: [l_partkey#X, l_quantity#X] Condition : isnotnull(l_partkey#X) -(25) HashAggregate +(26) HashAggregate Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(26) Exchange +(27) Exchange Input [3]: [l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) HashAggregate +(28) HashAggregate Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [CheckOverflow((0.200000 * promote_precision(avg(l_quantity#X)#X)), DecimalType(18,7), true) AS (0.2 * avg(l_quantity))#X, l_partkey#X] -(28) Filter +(29) Filter Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Condition : isnotnull((0.2 * avg(l_quantity))#X) -(29) BroadcastExchange +(30) BroadcastExchange Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(30) BroadcastHashJoin +(31) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(31) Project +(32) Project Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(32) HashAggregate +(33) HashAggregate Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes 
[2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(33) Exchange +(34) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(34) HashAggregate +(35) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6), true) AS avg_yearly#X] -(35) AdaptiveSparkPlan +(36) AdaptiveSparkPlan Output [1]: [avg_yearly#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/18.txt index 32359df262656..a4aaf08ff8bb3 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/18.txt @@ -1,86 +1,82 @@ == Physical Plan == -AdaptiveSparkPlan (86) +AdaptiveSparkPlan (88) +- == Final Plan == - VeloxColumnarToRowExec (53) - +- TakeOrderedAndProjectExecTransformer (52) - +- ^ RegularHashAggregateExecTransformer (50) - +- ^ InputIteratorTransformer (49) - +- ^ InputAdapter (48) - +- ^ ShuffleQueryStage (47) - +- ColumnarExchange (46) - +- ^ ProjectExecTransformer (44) - +- ^ FlushableHashAggregateExecTransformer (43) - +- ^ ProjectExecTransformer (42) - +- ^ BroadcastHashJoinExecTransformer Inner (41) - :- ^ ProjectExecTransformer (28) - : +- ^ BroadcastHashJoinExecTransformer Inner (27) + VeloxColumnarToRowExec (55) + +- TakeOrderedAndProjectExecTransformer (54) + +- ^ RegularHashAggregateExecTransformer (52) + +- ^ InputIteratorTransformer (51) + +- ShuffleQueryStage (49) + +- ColumnarExchange (48) + +- VeloxAppendBatches (47) + +- ^ ProjectExecTransformer (45) + +- ^ FlushableHashAggregateExecTransformer (44) + +- ^ ProjectExecTransformer (43) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (42) + :- ^ ProjectExecTransformer (29) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (28) : :- ^ InputIteratorTransformer (7) - : : +- ^ InputAdapter (6) - : : +- ^ BroadcastQueryStage (5) - : : +- ColumnarBroadcastExchange (4) - : : +- ^ FilterExecTransformer (2) - : : +- ^ Scan parquet (1) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (26) + : : +- BroadcastQueryStage (5) + : : +- ColumnarBroadcastExchange (4) + : : +- ^ FilterExecTransformer (2) + : : +- ^ Scan parquet (1) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (27) : :- ^ FilterExecTransformer (9) : : +- ^ Scan parquet (8) - : +- ^ InputIteratorTransformer (25) - : +- ^ InputAdapter (24) - : +- ^ BroadcastQueryStage (23) - : +- ColumnarBroadcastExchange (22) - : +- ^ ProjectExecTransformer (20) - : +- ^ FilterExecTransformer (19) - : +- ^ RegularHashAggregateExecTransformer (18) - : +- ^ InputIteratorTransformer (17) - : +- ^ InputAdapter (16) - : +- ^ ShuffleQueryStage (15) - : +- ColumnarExchange (14) - : +- ^ ProjectExecTransformer (12) - : +- ^ FlushableHashAggregateExecTransformer (11) - : +- ^ Scan parquet (10) - +- ^ InputIteratorTransformer (40) - +- ^ InputAdapter (39) - +- ^ BroadcastQueryStage (38) - +- ColumnarBroadcastExchange (37) - +- ^ BroadcastHashJoinExecTransformer LeftSemi (35) - :- ^ FilterExecTransformer (30) - : +- ^ Scan parquet (29) - +- ^ InputIteratorTransformer (34) - +- ^ InputAdapter (33) - +- ^ BroadcastQueryStage (32) - +- ReusedExchange (31) + : +- ^ InputIteratorTransformer (26) + : +- 
BroadcastQueryStage (24) + : +- ColumnarBroadcastExchange (23) + : +- ^ ProjectExecTransformer (21) + : +- ^ FilterExecTransformer (20) + : +- ^ RegularHashAggregateExecTransformer (19) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FlushableHashAggregateExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (41) + +- BroadcastQueryStage (39) + +- ColumnarBroadcastExchange (38) + +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (36) + :- ^ FilterExecTransformer (31) + : +- ^ Scan parquet (30) + +- ^ InputIteratorTransformer (35) + +- BroadcastQueryStage (33) + +- ReusedExchange (32) +- == Initial Plan == - TakeOrderedAndProject (85) - +- HashAggregate (84) - +- Exchange (83) - +- HashAggregate (82) - +- Project (81) - +- BroadcastHashJoin Inner BuildRight (80) - :- Project (68) - : +- BroadcastHashJoin Inner BuildLeft (67) - : :- BroadcastExchange (56) - : : +- Filter (55) - : : +- Scan parquet (54) - : +- BroadcastHashJoin LeftSemi BuildRight (66) - : :- Filter (58) - : : +- Scan parquet (57) - : +- BroadcastExchange (65) - : +- Project (64) - : +- Filter (63) - : +- HashAggregate (62) - : +- Exchange (61) - : +- HashAggregate (60) - : +- Scan parquet (59) - +- BroadcastExchange (79) - +- BroadcastHashJoin LeftSemi BuildRight (78) - :- Filter (70) - : +- Scan parquet (69) - +- BroadcastExchange (77) - +- Project (76) - +- Filter (75) - +- HashAggregate (74) - +- Exchange (73) - +- HashAggregate (72) - +- Scan parquet (71) + TakeOrderedAndProject (87) + +- HashAggregate (86) + +- Exchange (85) + +- HashAggregate (84) + +- Project (83) + +- BroadcastHashJoin Inner BuildRight (82) + :- Project (70) + : +- BroadcastHashJoin Inner BuildLeft (69) + : :- BroadcastExchange (58) + : : +- Filter (57) + : : +- Scan parquet (56) + : +- BroadcastHashJoin LeftSemi BuildRight (68) + : :- Filter (60) + : : +- Scan parquet (59) + : +- BroadcastExchange (67) + : +- Project (66) + : +- Filter (65) + : +- HashAggregate (64) + : +- Exchange (63) + : +- HashAggregate (62) + : +- Scan parquet (61) + +- BroadcastExchange (81) + +- BroadcastHashJoin LeftSemi BuildRight (80) + :- Filter (72) + : +- Scan parquet (71) + +- BroadcastExchange (79) + +- Project (78) + +- Filter (77) + +- HashAggregate (76) + +- Exchange (75) + +- HashAggregate (74) + +- Scan parquet (73) (1) Scan parquet @@ -144,333 +140,341 @@ Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS 
sum(l_quantity#X)#X] -(19) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [1]: [l_orderkey#X] Arguments: false -(22) ColumnarBroadcastExchange +(23) ColumnarBroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(23) BroadcastQueryStage +(24) BroadcastQueryStage Output [1]: [l_orderkey#X] Arguments: X -(24) InputAdapter +(25) InputAdapter Input [1]: [l_orderkey#X] -(25) InputIteratorTransformer +(26) InputIteratorTransformer Input [1]: [l_orderkey#X] -(26) BroadcastHashJoinExecTransformer +(27) BroadcastHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(27) BroadcastHashJoinExecTransformer +(28) BroadcastHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(28) ProjectExecTransformer +(29) ProjectExecTransformer Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(29) Scan parquet +(30) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(30) FilterExecTransformer +(31) FilterExecTransformer Input [2]: [l_orderkey#X, l_quantity#X] Arguments: isnotnull(l_orderkey#X) -(31) ReusedExchange [Reuses operator id: 22] +(32) ReusedExchange [Reuses operator id: 23] Output [1]: [l_orderkey#X] -(32) BroadcastQueryStage +(33) BroadcastQueryStage Output [1]: [l_orderkey#X] Arguments: X -(33) InputAdapter +(34) InputAdapter Input [1]: [l_orderkey#X] -(34) InputIteratorTransformer +(35) InputIteratorTransformer Input [1]: [l_orderkey#X] -(35) BroadcastHashJoinExecTransformer +(36) BroadcastHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(36) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [2]: [l_orderkey#X, l_quantity#X] Arguments: false -(37) ColumnarBroadcastExchange +(38) ColumnarBroadcastExchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(38) BroadcastQueryStage +(39) BroadcastQueryStage Output [2]: [l_orderkey#X, l_quantity#X] Arguments: X -(39) InputAdapter +(40) InputAdapter Input [2]: [l_orderkey#X, l_quantity#X] -(40) InputIteratorTransformer +(41) InputIteratorTransformer Input [2]: [l_orderkey#X, l_quantity#X] -(41) BroadcastHashJoinExecTransformer +(42) BroadcastHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(42) ProjectExecTransformer +(43) ProjectExecTransformer Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(43) FlushableHashAggregateExecTransformer +(44) FlushableHashAggregateExecTransformer Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, 
o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(44) ProjectExecTransformer +(45) ProjectExecTransformer Output [8]: [hash(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 42) AS hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(45) WholeStageCodegenTransformer (X) +(46) WholeStageCodegenTransformer (X) Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: false -(46) ColumnarExchange +(47) VeloxAppendBatches +Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] +Arguments: X + +(48) ColumnarExchange Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(47) ShuffleQueryStage +(49) ShuffleQueryStage Output [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: X -(48) InputAdapter +(50) InputAdapter Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(49) InputIteratorTransformer +(51) InputIteratorTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(50) RegularHashAggregateExecTransformer +(52) RegularHashAggregateExecTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(51) WholeStageCodegenTransformer (X) +(53) WholeStageCodegenTransformer (X) Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: false -(52) TakeOrderedAndProjectExecTransformer +(54) TakeOrderedAndProjectExecTransformer Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X], 0 -(53) VeloxColumnarToRowExec +(55) VeloxColumnarToRowExec Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(54) Scan parquet +(56) Scan parquet Output [2]: [c_custkey#X, c_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey)] ReadSchema: struct -(55) Filter +(57) Filter Input [2]: [c_custkey#X, c_name#X] Condition : isnotnull(c_custkey#X) -(56) BroadcastExchange +(58) BroadcastExchange Input [2]: [c_custkey#X, c_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(57) Scan parquet +(59) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: 
InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(58) Filter +(60) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Condition : (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(59) Scan parquet +(61) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(60) HashAggregate +(62) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(61) Exchange +(63) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(62) HashAggregate +(64) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(63) Filter +(65) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(64) Project +(66) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(65) BroadcastExchange +(67) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(66) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(67) BroadcastHashJoin +(69) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(68) Project +(70) Project Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(69) Scan parquet +(71) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [2]: [l_orderkey#X, l_quantity#X] Condition : isnotnull(l_orderkey#X) -(71) Scan parquet +(73) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(72) HashAggregate +(74) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(73) Exchange +(75) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) HashAggregate +(76) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(75) Filter +(77) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(76) Project +(78) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(77) BroadcastExchange +(79) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(78) BroadcastHashJoin +(80) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] 
Join condition: None -(79) BroadcastExchange +(81) BroadcastExchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(80) BroadcastHashJoin +(82) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(81) Project +(83) Project Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(82) HashAggregate +(84) HashAggregate Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(83) Exchange +(85) Exchange Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(84) HashAggregate +(86) HashAggregate Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(85) TakeOrderedAndProject +(87) TakeOrderedAndProject Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(86) AdaptiveSparkPlan +(88) AdaptiveSparkPlan Output [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/19.txt index 232c27fb8a9e3..87acf3c4b28e6 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/19.txt @@ -1,36 +1,35 @@ == Physical Plan == -AdaptiveSparkPlan (33) +AdaptiveSparkPlan (34) +- == Final Plan == - VeloxColumnarToRowExec (21) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (22) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) :- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ BroadcastQueryStage (8) - +- ColumnarBroadcastExchange (7) - +- ^ FilterExecTransformer (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (8) + +- 
ColumnarBroadcastExchange (7) + +- ^ FilterExecTransformer (5) + +- ^ Scan parquet (4) +- == Initial Plan == - HashAggregate (32) - +- Exchange (31) - +- HashAggregate (30) - +- Project (29) - +- BroadcastHashJoin Inner BuildRight (28) - :- Project (24) - : +- Filter (23) - : +- Scan parquet (22) - +- BroadcastExchange (27) - +- Filter (26) - +- Scan parquet (25) + HashAggregate (33) + +- Exchange (32) + +- HashAggregate (31) + +- Project (30) + +- BroadcastHashJoin Inner BuildRight (29) + :- Project (25) + : +- Filter (24) + : +- Scan parquet (23) + +- BroadcastExchange (28) + +- Filter (27) + +- Scan parquet (26) (1) Scan parquet @@ -97,91 +96,95 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(16) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [2]: [sum#X, isEmpty#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(20) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(21) VeloxColumnarToRowExec +(22) VeloxColumnarToRowExec Input [1]: [revenue#X] -(22) Scan parquet +(23) Scan parquet Output [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipinstruct), In(l_shipmode, [AIR,AIR REG]), EqualTo(l_shipinstruct,DELIVER IN PERSON), IsNotNull(l_partkey), Or(Or(And(GreaterThanOrEqual(l_quantity,1.00),LessThanOrEqual(l_quantity,11.00)),And(GreaterThanOrEqual(l_quantity,10.00),LessThanOrEqual(l_quantity,20.00))),And(GreaterThanOrEqual(l_quantity,20.00),LessThanOrEqual(l_quantity,30.00)))] ReadSchema: struct -(23) Filter +(24) Filter Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Condition : ((((isnotnull(l_shipinstruct#X) AND l_shipmode#X IN (AIR,AIR REG)) AND (l_shipinstruct#X = DELIVER IN PERSON)) AND isnotnull(l_partkey#X)) AND ((((l_quantity#X >= 1.00) AND (l_quantity#X <= 11.00)) OR ((l_quantity#X >= 10.00) AND (l_quantity#X <= 20.00))) OR ((l_quantity#X >= 20.00) AND (l_quantity#X <= 30.00)))) -(24) Project +(25) Project Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] -(25) 
Scan parquet +(26) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(26) Filter +(27) Filter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Condition : (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(27) BroadcastExchange +(28) BroadcastExchange Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(28) BroadcastHashJoin +(29) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(29) Project +(30) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(30) HashAggregate +(31) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(31) Exchange +(32) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(32) HashAggregate +(33) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(33) AdaptiveSparkPlan +(34) AdaptiveSparkPlan Output [1]: 
[revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/20.txt index fe2ae1df7cc7f..9e03d83195371 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/20.txt @@ -1,105 +1,102 @@ == Physical Plan == -AdaptiveSparkPlan (107) +AdaptiveSparkPlan (112) +- == Final Plan == - VeloxColumnarToRowExec (70) - +- ^ SortExecTransformer (68) - +- ^ InputIteratorTransformer (67) - +- ^ InputAdapter (66) - +- ^ ShuffleQueryStage (65) - +- ColumnarExchange (64) - +- ^ ProjectExecTransformer (62) - +- ^ BroadcastHashJoinExecTransformer Inner (61) - :- ^ ProjectExecTransformer (52) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (51) - : :- ^ InputIteratorTransformer (9) - : : +- ^ InputAdapter (8) - : : +- ^ AQEShuffleRead (7) - : : +- ^ ShuffleQueryStage (6) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (73) + +- ^ SortExecTransformer (71) + +- ^ InputIteratorTransformer (70) + +- ShuffleQueryStage (68) + +- ColumnarExchange (67) + +- VeloxAppendBatches (66) + +- ^ ProjectExecTransformer (64) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (63) + :- ^ ProjectExecTransformer (54) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (53) + : :- ^ InputIteratorTransformer (10) + : : +- AQEShuffleRead (8) + : : +- ShuffleQueryStage (7) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (50) - : +- ^ InputAdapter (49) - : +- ^ BroadcastQueryStage (48) - : +- ColumnarBroadcastExchange (47) - : +- ^ ProjectExecTransformer (45) - : +- ^ BroadcastHashJoinExecTransformer Inner (44) - : :- ^ InputIteratorTransformer (25) - : : +- ^ InputAdapter (24) - : : +- ^ BroadcastQueryStage (23) - : : +- ColumnarBroadcastExchange (22) - : : +- ^ BroadcastHashJoinExecTransformer LeftSemi (20) - : : :- ^ FilterExecTransformer (11) - : : : +- ^ Scan parquet (10) - : : +- ^ InputIteratorTransformer (19) - : : +- ^ InputAdapter (18) - : : +- ^ BroadcastQueryStage (17) - : : +- ColumnarBroadcastExchange (16) - : : +- ^ ProjectExecTransformer (14) - : : +- ^ FilterExecTransformer (13) - : : +- ^ Scan parquet (12) - : +- ^ FilterExecTransformer (43) - : +- ^ ProjectExecTransformer (42) - : +- ^ RegularHashAggregateExecTransformer (41) - : +- ^ InputIteratorTransformer (40) - : +- ^ InputAdapter (39) - : +- ^ ShuffleQueryStage (38) - : +- ColumnarExchange (37) - : +- ^ ProjectExecTransformer (35) - : +- ^ FlushableHashAggregateExecTransformer (34) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (33) - : :- ^ ProjectExecTransformer (28) - : : +- ^ FilterExecTransformer (27) - : : +- ^ Scan parquet (26) - : +- ^ InputIteratorTransformer (32) - : +- ^ InputAdapter (31) - : +- ^ BroadcastQueryStage (30) - : +- ReusedExchange (29) - +- ^ InputIteratorTransformer (60) - +- ^ InputAdapter (59) - +- ^ BroadcastQueryStage (58) - +- ColumnarBroadcastExchange (57) - +- ^ ProjectExecTransformer (55) - +- ^ FilterExecTransformer (54) - +- ^ Scan parquet (53) + : +- ^ InputIteratorTransformer (52) + : +- BroadcastQueryStage (50) + : +- ColumnarBroadcastExchange (49) + : +- ^ ProjectExecTransformer (47) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (46) + : :- ^ InputIteratorTransformer (26) + 
: : +- BroadcastQueryStage (24) + : : +- ColumnarBroadcastExchange (23) + : : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (21) + : : :- ^ FilterExecTransformer (12) + : : : +- ^ Scan parquet (11) + : : +- ^ InputIteratorTransformer (20) + : : +- BroadcastQueryStage (18) + : : +- ColumnarBroadcastExchange (17) + : : +- ^ ProjectExecTransformer (15) + : : +- ^ FilterExecTransformer (14) + : : +- ^ Scan parquet (13) + : +- ^ FilterExecTransformer (45) + : +- ^ ProjectExecTransformer (44) + : +- ^ RegularHashAggregateExecTransformer (43) + : +- ^ InputIteratorTransformer (42) + : +- ShuffleQueryStage (40) + : +- ColumnarExchange (39) + : +- VeloxAppendBatches (38) + : +- ^ ProjectExecTransformer (36) + : +- ^ FlushableHashAggregateExecTransformer (35) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (34) + : :- ^ ProjectExecTransformer (29) + : : +- ^ FilterExecTransformer (28) + : : +- ^ Scan parquet (27) + : +- ^ InputIteratorTransformer (33) + : +- BroadcastQueryStage (31) + : +- ReusedExchange (30) + +- ^ InputIteratorTransformer (62) + +- BroadcastQueryStage (60) + +- ColumnarBroadcastExchange (59) + +- ^ ProjectExecTransformer (57) + +- ^ FilterExecTransformer (56) + +- ^ Scan parquet (55) +- == Initial Plan == - Sort (106) - +- Exchange (105) - +- Project (104) - +- BroadcastHashJoin Inner BuildRight (103) - :- Project (98) - : +- ShuffledHashJoin LeftSemi BuildRight (97) - : :- Exchange (73) - : : +- Filter (72) - : : +- Scan parquet (71) - : +- Exchange (96) - : +- Project (95) - : +- BroadcastHashJoin Inner BuildLeft (94) - : :- BroadcastExchange (81) - : : +- BroadcastHashJoin LeftSemi BuildRight (80) - : : :- Filter (75) - : : : +- Scan parquet (74) - : : +- BroadcastExchange (79) - : : +- Project (78) - : : +- Filter (77) - : : +- Scan parquet (76) - : +- Filter (93) - : +- HashAggregate (92) - : +- Exchange (91) - : +- HashAggregate (90) - : +- BroadcastHashJoin LeftSemi BuildRight (89) - : :- Project (84) - : : +- Filter (83) - : : +- Scan parquet (82) - : +- BroadcastExchange (88) - : +- Project (87) - : +- Filter (86) - : +- Scan parquet (85) - +- BroadcastExchange (102) - +- Project (101) - +- Filter (100) - +- Scan parquet (99) + Sort (111) + +- Exchange (110) + +- Project (109) + +- BroadcastHashJoin Inner BuildRight (108) + :- Project (103) + : +- SortMergeJoin LeftSemi (102) + : :- Sort (77) + : : +- Exchange (76) + : : +- Filter (75) + : : +- Scan parquet (74) + : +- Sort (101) + : +- Exchange (100) + : +- Project (99) + : +- BroadcastHashJoin Inner BuildLeft (98) + : :- BroadcastExchange (85) + : : +- BroadcastHashJoin LeftSemi BuildRight (84) + : : :- Filter (79) + : : : +- Scan parquet (78) + : : +- BroadcastExchange (83) + : : +- Project (82) + : : +- Filter (81) + : : +- Scan parquet (80) + : +- Filter (97) + : +- HashAggregate (96) + : +- Exchange (95) + : +- HashAggregate (94) + : +- BroadcastHashJoin LeftSemi BuildRight (93) + : :- Project (88) + : : +- Filter (87) + : : +- Scan parquet (86) + : +- BroadcastExchange (92) + : +- Project (91) + : +- Filter (90) + : +- Scan parquet (89) + +- BroadcastExchange (107) + +- Project (106) + +- Filter (105) + +- Scan parquet (104) (1) Scan parquet @@ -121,448 +118,468 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(6) 
ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: X -(7) AQEShuffleRead +(8) AQEShuffleRead Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: local -(8) InputAdapter +(9) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(10) Scan parquet +(11) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(11) FilterExecTransformer +(12) FilterExecTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(12) Scan parquet +(13) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(13) FilterExecTransformer +(14) FilterExecTransformer Input [2]: [p_partkey#X, p_name#X] Arguments: (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(14) ProjectExecTransformer +(15) ProjectExecTransformer Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(15) WholeStageCodegenTransformer (X) +(16) WholeStageCodegenTransformer (X) Input [1]: [p_partkey#X] Arguments: false -(16) ColumnarBroadcastExchange +(17) ColumnarBroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(17) BroadcastQueryStage +(18) BroadcastQueryStage Output [1]: [p_partkey#X] Arguments: X -(18) InputAdapter +(19) InputAdapter Input [1]: [p_partkey#X] -(19) InputIteratorTransformer +(20) InputIteratorTransformer Input [1]: [p_partkey#X] -(20) BroadcastHashJoinExecTransformer +(21) BroadcastHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(22) ColumnarBroadcastExchange +(23) ColumnarBroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(23) BroadcastQueryStage +(24) BroadcastQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(24) InputAdapter +(25) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(25) InputIteratorTransformer +(26) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(26) Scan parquet +(27) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(27) FilterExecTransformer +(28) FilterExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Arguments: ((((isnotnull(l_shipdate#X) AND 
(l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(28) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(29) ReusedExchange [Reuses operator id: 16] +(30) ReusedExchange [Reuses operator id: 17] Output [1]: [p_partkey#X] -(30) BroadcastQueryStage +(31) BroadcastQueryStage Output [1]: [p_partkey#X] Arguments: X -(31) InputAdapter +(32) InputAdapter Input [1]: [p_partkey#X] -(32) InputIteratorTransformer +(33) InputIteratorTransformer Input [1]: [p_partkey#X] -(33) BroadcastHashJoinExecTransformer +(34) BroadcastHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(34) FlushableHashAggregateExecTransformer +(35) FlushableHashAggregateExecTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(35) ProjectExecTransformer +(36) ProjectExecTransformer Output [5]: [hash(l_partkey#X, l_suppkey#X, 42) AS hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(36) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(37) ColumnarExchange +(38) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(39) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(38) ShuffleQueryStage +(40) ShuffleQueryStage Output [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(39) InputAdapter +(41) InputAdapter Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(40) InputIteratorTransformer +(42) InputIteratorTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(41) RegularHashAggregateExecTransformer +(43) RegularHashAggregateExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(42) ProjectExecTransformer +(44) ProjectExecTransformer Output [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3), true) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(43) FilterExecTransformer +(45) FilterExecTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: isnotnull((0.5 * sum(l_quantity))#X) -(44) BroadcastHashJoinExecTransformer +(46) BroadcastHashJoinExecTransformer Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(cast(ps_availqty#X as decimal(10,0)) as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(45) ProjectExecTransformer +(47) ProjectExecTransformer Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(46) WholeStageCodegenTransformer (X) +(48) 
WholeStageCodegenTransformer (X) Input [1]: [ps_suppkey#X] Arguments: false -(47) ColumnarBroadcastExchange +(49) ColumnarBroadcastExchange Input [1]: [ps_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(48) BroadcastQueryStage +(50) BroadcastQueryStage Output [1]: [ps_suppkey#X] Arguments: X -(49) InputAdapter +(51) InputAdapter Input [1]: [ps_suppkey#X] -(50) InputIteratorTransformer +(52) InputIteratorTransformer Input [1]: [ps_suppkey#X] -(51) BroadcastHashJoinExecTransformer +(53) BroadcastHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(52) ProjectExecTransformer +(54) ProjectExecTransformer Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(53) Scan parquet +(55) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(54) FilterExecTransformer +(56) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(55) ProjectExecTransformer +(57) ProjectExecTransformer Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(56) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [1]: [n_nationkey#X] Arguments: false -(57) ColumnarBroadcastExchange +(59) ColumnarBroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(58) BroadcastQueryStage +(60) BroadcastQueryStage Output [1]: [n_nationkey#X] Arguments: X -(59) InputAdapter +(61) InputAdapter Input [1]: [n_nationkey#X] -(60) InputIteratorTransformer +(62) InputIteratorTransformer Input [1]: [n_nationkey#X] -(61) BroadcastHashJoinExecTransformer +(63) BroadcastHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(62) ProjectExecTransformer +(64) ProjectExecTransformer Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(63) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(64) ColumnarExchange +(66) VeloxAppendBatches +Input [2]: [s_name#X, s_address#X] +Arguments: X + +(67) ColumnarExchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(65) ShuffleQueryStage +(68) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] Arguments: X -(66) InputAdapter +(69) InputAdapter Input [2]: [s_name#X, s_address#X] -(67) InputIteratorTransformer +(70) InputIteratorTransformer Input [2]: [s_name#X, s_address#X] -(68) SortExecTransformer +(71) SortExecTransformer Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(69) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(70) VeloxColumnarToRowExec +(73) VeloxColumnarToRowExec Input [2]: [s_name#X, s_address#X] -(71) Scan parquet +(74) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_nationkey)] ReadSchema: struct -(72) Filter +(75) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Condition : 
isnotnull(s_nationkey#X) -(73) Exchange +(76) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) Scan parquet +(77) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(78) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(75) Filter +(79) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Condition : ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(76) Scan parquet +(80) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(77) Filter +(81) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(78) Project +(82) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(79) BroadcastExchange +(83) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(80) BroadcastHashJoin +(84) BroadcastHashJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(81) BroadcastExchange +(85) BroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(82) Scan parquet +(86) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(83) Filter +(87) Filter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(84) Project +(88) Project Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(85) Scan parquet +(89) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(86) Filter +(90) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(87) Project +(91) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(88) BroadcastExchange +(92) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(89) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(90) HashAggregate +(94) HashAggregate Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(91) Exchange +(95) Exchange Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), 
ENSURE_REQUIREMENTS, [plan_id=X] -(92) HashAggregate +(96) HashAggregate Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3), true) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(93) Filter +(97) Filter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Condition : isnotnull((0.5 * sum(l_quantity))#X) -(94) BroadcastHashJoin +(98) BroadcastHashJoin Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(cast(ps_availqty#X as decimal(10,0)) as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(95) Project +(99) Project Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(96) Exchange +(100) Exchange Input [1]: [ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) ShuffledHashJoin +(101) Sort +Input [1]: [ps_suppkey#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(102) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(98) Project +(103) Project Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(99) Scan parquet +(104) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(100) Filter +(105) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(101) Project +(106) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(102) BroadcastExchange +(107) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(103) BroadcastHashJoin +(108) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(104) Project +(109) Project Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(105) Exchange +(110) Exchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) Sort +(111) Sort Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(107) AdaptiveSparkPlan +(112) AdaptiveSparkPlan Output [2]: [s_name#X, s_address#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/21.txt index c3bd62e04735a..ee28d7b592e87 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/21.txt @@ -1,91 +1,86 @@ == Physical Plan == -AdaptiveSparkPlan (92) +AdaptiveSparkPlan (93) +- == Final Plan == - VeloxColumnarToRowExec (59) - +- TakeOrderedAndProjectExecTransformer (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53) - +- ColumnarExchange (52) + VeloxColumnarToRowExec (60) + +- TakeOrderedAndProjectExecTransformer 
(59) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ InputIteratorTransformer (56) + +- ShuffleQueryStage (54) + +- ColumnarExchange (53) + +- VeloxAppendBatches (52) +- ^ ProjectExecTransformer (50) +- ^ FlushableHashAggregateExecTransformer (49) +- ^ ProjectExecTransformer (48) - +- ^ BroadcastHashJoinExecTransformer Inner (47) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) :- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (37) : :- ^ ProjectExecTransformer (28) - : : +- ^ BroadcastHashJoinExecTransformer Inner (27) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (27) : : :- ^ InputIteratorTransformer (7) - : : : +- ^ InputAdapter (6) - : : : +- ^ BroadcastQueryStage (5) - : : : +- ColumnarBroadcastExchange (4) - : : : +- ^ FilterExecTransformer (2) - : : : +- ^ Scan parquet (1) - : : +- ^ BroadcastHashJoinExecTransformer LeftAnti (26) - : : :- ^ BroadcastHashJoinExecTransformer LeftSemi (17) + : : : +- BroadcastQueryStage (5) + : : : +- ColumnarBroadcastExchange (4) + : : : +- ^ FilterExecTransformer (2) + : : : +- ^ Scan parquet (1) + : : +- ^ BroadcastHashJoinExecTransformer LeftAnti BuildRight (26) + : : :- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (17) : : : :- ^ ProjectExecTransformer (10) : : : : +- ^ FilterExecTransformer (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (16) - : : : +- ^ InputAdapter (15) - : : : +- ^ BroadcastQueryStage (14) - : : : +- ColumnarBroadcastExchange (13) - : : : +- ^ Scan parquet (11) + : : : +- BroadcastQueryStage (14) + : : : +- ColumnarBroadcastExchange (13) + : : : +- ^ Scan parquet (11) : : +- ^ InputIteratorTransformer (25) - : : +- ^ InputAdapter (24) - : : +- ^ BroadcastQueryStage (23) - : : +- ColumnarBroadcastExchange (22) - : : +- ^ ProjectExecTransformer (20) - : : +- ^ FilterExecTransformer (19) - : : +- ^ Scan parquet (18) + : : +- BroadcastQueryStage (23) + : : +- ColumnarBroadcastExchange (22) + : : +- ^ ProjectExecTransformer (20) + : : +- ^ FilterExecTransformer (19) + : : +- ^ Scan parquet (18) : +- ^ InputIteratorTransformer (36) - : +- ^ InputAdapter (35) - : +- ^ BroadcastQueryStage (34) - : +- ColumnarBroadcastExchange (33) - : +- ^ ProjectExecTransformer (31) - : +- ^ FilterExecTransformer (30) - : +- ^ Scan parquet (29) + : +- BroadcastQueryStage (34) + : +- ColumnarBroadcastExchange (33) + : +- ^ ProjectExecTransformer (31) + : +- ^ FilterExecTransformer (30) + : +- ^ Scan parquet (29) +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ BroadcastQueryStage (44) - +- ColumnarBroadcastExchange (43) - +- ^ ProjectExecTransformer (41) - +- ^ FilterExecTransformer (40) - +- ^ Scan parquet (39) + +- BroadcastQueryStage (44) + +- ColumnarBroadcastExchange (43) + +- ^ ProjectExecTransformer (41) + +- ^ FilterExecTransformer (40) + +- ^ Scan parquet (39) +- == Initial Plan == - TakeOrderedAndProject (91) - +- HashAggregate (90) - +- Exchange (89) - +- HashAggregate (88) - +- Project (87) - +- BroadcastHashJoin Inner BuildRight (86) - :- Project (81) - : +- BroadcastHashJoin Inner BuildRight (80) - : :- Project (75) - : : +- BroadcastHashJoin Inner BuildLeft (74) - : : :- BroadcastExchange (62) - : : : +- Filter (61) - : : : +- Scan parquet (60) - : : +- BroadcastHashJoin LeftAnti BuildRight (73) - : : :- BroadcastHashJoin LeftSemi BuildRight (68) - : : : :- Project (65) - : : : : +- Filter (64) - : : : : +- Scan parquet (63) - : : : +- BroadcastExchange (67) 
- : : : +- Scan parquet (66) - : : +- BroadcastExchange (72) - : : +- Project (71) - : : +- Filter (70) - : : +- Scan parquet (69) - : +- BroadcastExchange (79) - : +- Project (78) - : +- Filter (77) - : +- Scan parquet (76) - +- BroadcastExchange (85) - +- Project (84) - +- Filter (83) - +- Scan parquet (82) + TakeOrderedAndProject (92) + +- HashAggregate (91) + +- Exchange (90) + +- HashAggregate (89) + +- Project (88) + +- BroadcastHashJoin Inner BuildRight (87) + :- Project (82) + : +- BroadcastHashJoin Inner BuildRight (81) + : :- Project (76) + : : +- BroadcastHashJoin Inner BuildLeft (75) + : : :- BroadcastExchange (63) + : : : +- Filter (62) + : : : +- Scan parquet (61) + : : +- BroadcastHashJoin LeftAnti BuildRight (74) + : : :- BroadcastHashJoin LeftSemi BuildRight (69) + : : : :- Project (66) + : : : : +- Filter (65) + : : : : +- Scan parquet (64) + : : : +- BroadcastExchange (68) + : : : +- Scan parquet (67) + : : +- BroadcastExchange (73) + : : +- Project (72) + : : +- Filter (71) + : : +- Scan parquet (70) + : +- BroadcastExchange (80) + : +- Project (79) + : +- Filter (78) + : +- Scan parquet (77) + +- BroadcastExchange (86) + +- Project (85) + +- Filter (84) + +- Scan parquet (83) (1) Scan parquet @@ -307,194 +302,198 @@ Input [2]: [s_name#X, count#X] Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: false -(52) ColumnarExchange +(52) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, count#X] +Arguments: X + +(53) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(54) ShuffleQueryStage Output [2]: [s_name#X, count#X] Arguments: X -(54) InputAdapter +(55) InputAdapter Input [2]: [s_name#X, count#X] -(55) InputIteratorTransformer +(56) InputIteratorTransformer Input [2]: [s_name#X, count#X] -(56) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(57) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, numwait#X] Arguments: false -(58) TakeOrderedAndProjectExecTransformer +(59) TakeOrderedAndProjectExecTransformer Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X], 0 -(59) VeloxColumnarToRowExec +(60) VeloxColumnarToRowExec Input [2]: [s_name#X, numwait#X] -(60) Scan parquet +(61) Scan parquet Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(61) Filter +(62) Filter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(62) BroadcastExchange +(63) BroadcastExchange Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(63) Scan parquet +(64) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(64) Filter +(65) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : 
((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(65) Project +(66) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(66) Scan parquet +(67) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(67) BroadcastExchange +(68) BroadcastExchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(68) BroadcastHashJoin +(69) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(69) Scan parquet +(70) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(70) Filter +(71) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(71) Project +(72) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(72) BroadcastExchange +(73) BroadcastExchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(73) BroadcastHashJoin +(74) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(74) BroadcastHashJoin +(75) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(75) Project +(76) Project Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(76) Scan parquet +(77) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(77) Filter +(78) Filter Input [2]: [o_orderkey#X, o_orderstatus#X] Condition : ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(78) Project +(79) Project Output [1]: [o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(79) BroadcastExchange +(80) BroadcastExchange Input [1]: [o_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(80) BroadcastHashJoin +(81) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(81) Project +(82) Project Output [2]: [s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(82) Scan parquet +(83) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(83) Filter +(84) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(84) Project +(85) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(85) BroadcastExchange +(86) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] 
-(86) BroadcastHashJoin +(87) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(87) Project +(88) Project Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(88) HashAggregate +(89) HashAggregate Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(89) Exchange +(90) Exchange Input [2]: [s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) HashAggregate +(91) HashAggregate Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(91) TakeOrderedAndProject +(92) TakeOrderedAndProject Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X] -(92) AdaptiveSparkPlan +(93) AdaptiveSparkPlan Output [2]: [s_name#X, numwait#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/22.txt index dff4926856867..d578b43f3d6a3 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/22.txt @@ -1,40 +1,39 @@ == Physical Plan == -AdaptiveSparkPlan (38) +AdaptiveSparkPlan (40) +- == Final Plan == - VeloxColumnarToRowExec (26) - +- ^ SortExecTransformer (24) - +- ^ InputIteratorTransformer (23) - +- ^ InputAdapter (22) - +- ^ ShuffleQueryStage (21) - +- ColumnarExchange (20) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ InputIteratorTransformer (17) - +- ^ InputAdapter (16) - +- ^ ShuffleQueryStage (15) - +- ColumnarExchange (14) + VeloxColumnarToRowExec (28) + +- ^ SortExecTransformer (26) + +- ^ InputIteratorTransformer (25) + +- ShuffleQueryStage (23) + +- ColumnarExchange (22) + +- VeloxAppendBatches (21) + +- ^ RegularHashAggregateExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) +- ^ ProjectExecTransformer (12) +- ^ FlushableHashAggregateExecTransformer (11) +- ^ ProjectExecTransformer (10) - +- ^ BroadcastHashJoinExecTransformer LeftAnti (9) + +- ^ BroadcastHashJoinExecTransformer LeftAnti BuildRight (9) :- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (8) - +- ^ InputAdapter (7) - +- ^ BroadcastQueryStage (6) - +- ColumnarBroadcastExchange (5) - +- ^ Scan parquet (3) + +- BroadcastQueryStage (6) + +- ColumnarBroadcastExchange (5) + +- ^ Scan parquet (3) +- == Initial Plan == - Sort (37) - +- Exchange (36) - +- HashAggregate (35) - +- Exchange (34) - +- HashAggregate (33) - +- Project (32) - +- BroadcastHashJoin LeftAnti BuildRight (31) - :- Filter (28) - : +- Scan parquet (27) - +- BroadcastExchange (30) - +- Scan parquet (29) + Sort (39) + +- Exchange (38) + +- HashAggregate (37) + +- Exchange (36) + +- HashAggregate (35) + +- Project (34) + +- BroadcastHashJoin LeftAnti BuildRight (33) + :- Filter (30) + : +- Scan parquet (29) + +- BroadcastExchange (32) + +- Scan parquet (31) (1) Scan parquet @@ -96,112 +95,120 @@ Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input 
[5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(19) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(20) ColumnarExchange +(21) VeloxAppendBatches +Input [3]: [cntrycode#X, numcust#X, totacctbal#X] +Arguments: X + +(22) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(23) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: X -(22) InputAdapter +(24) InputAdapter Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(23) InputIteratorTransformer +(25) InputIteratorTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(24) SortExecTransformer +(26) SortExecTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(25) WholeStageCodegenTransformer (X) +(27) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(26) VeloxColumnarToRowExec +(28) VeloxColumnarToRowExec Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(27) Scan parquet +(29) Scan parquet Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal)] ReadSchema: struct -(28) Filter +(30) Filter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) AND (cast(c_acctbal#X as decimal(16,6)) > Subquery subquery#X, [id=#X])) -(29) Scan parquet +(31) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(30) BroadcastExchange +(32) BroadcastExchange Input [1]: [o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(31) BroadcastHashJoin +(33) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(32) Project +(34) Project Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(33) HashAggregate +(35) HashAggregate Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(34) Exchange +(36) Exchange Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, 
[plan_id=X] -(35) HashAggregate +(37) HashAggregate Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(36) Exchange +(38) Exchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(37) Sort +(39) Sort Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(38) AdaptiveSparkPlan +(40) AdaptiveSparkPlan Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/3.txt index 950a881d1ecf8..2ffd75b92964a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/3.txt @@ -1,55 +1,53 @@ == Physical Plan == -AdaptiveSparkPlan (53) +AdaptiveSparkPlan (54) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- TakeOrderedAndProjectExecTransformer (33) - +- ^ ProjectExecTransformer (31) - +- ^ RegularHashAggregateExecTransformer (30) - +- ^ InputIteratorTransformer (29) - +- ^ InputAdapter (28) - +- ^ ShuffleQueryStage (27) - +- ColumnarExchange (26) + VeloxColumnarToRowExec (35) + +- TakeOrderedAndProjectExecTransformer (34) + +- ^ ProjectExecTransformer (32) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) +- ^ ProjectExecTransformer (24) +- ^ FlushableHashAggregateExecTransformer (23) +- ^ ProjectExecTransformer (22) - +- ^ BroadcastHashJoinExecTransformer Inner (21) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (21) :- ^ ProjectExecTransformer (12) - : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ BroadcastQueryStage (6) - : : +- ColumnarBroadcastExchange (5) - : : +- ^ ProjectExecTransformer (3) - : : +- ^ FilterExecTransformer (2) - : : +- ^ Scan parquet (1) + : : +- BroadcastQueryStage (6) + : : +- ColumnarBroadcastExchange (5) + : : +- ^ ProjectExecTransformer (3) + : : +- ^ FilterExecTransformer (2) + : : +- ^ Scan parquet (1) : +- ^ FilterExecTransformer (10) : +- ^ Scan parquet (9) +- ^ InputIteratorTransformer (20) - +- ^ InputAdapter (19) - +- ^ BroadcastQueryStage (18) - +- ColumnarBroadcastExchange (17) - +- ^ ProjectExecTransformer (15) - +- ^ FilterExecTransformer (14) - +- ^ Scan parquet (13) + +- BroadcastQueryStage (18) + +- ColumnarBroadcastExchange (17) + +- ^ ProjectExecTransformer (15) + +- ^ FilterExecTransformer (14) + +- ^ Scan parquet (13) +- == Initial Plan == - TakeOrderedAndProject (52) - +- HashAggregate (51) - +- Exchange (50) - +- HashAggregate (49) - +- Project (48) - +- BroadcastHashJoin Inner BuildRight (47) - :- Project (42) - : +- BroadcastHashJoin Inner BuildLeft (41) - : :- BroadcastExchange (38) - : : +- Project (37) - : : +- Filter (36) - : : +- Scan parquet (35) - : +- Filter (40) - : +- Scan parquet (39) - +- BroadcastExchange (46) - +- Project (45) - +- Filter (44) - +- Scan parquet (43) + TakeOrderedAndProject (53) + +- 
HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- Project (49) + +- BroadcastHashJoin Inner BuildRight (48) + :- Project (43) + : +- BroadcastHashJoin Inner BuildLeft (42) + : :- BroadcastExchange (39) + : : +- Project (38) + : : +- Filter (37) + : : +- Scan parquet (36) + : +- Filter (41) + : +- Scan parquet (40) + +- BroadcastExchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -162,131 +160,135 @@ Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: false -(26) ColumnarExchange +(26) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] +Arguments: X + +(27) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(27) ShuffleQueryStage +(28) ShuffleQueryStage Output [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: X -(28) InputAdapter +(29) InputAdapter Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(29) InputIteratorTransformer +(30) InputIteratorTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(30) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(31) ProjectExecTransformer +(32) ProjectExecTransformer Output [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, o_orderdate#X, o_shippriority#X] Input [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(32) WholeStageCodegenTransformer (X) +(33) WholeStageCodegenTransformer (X) Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: false -(33) TakeOrderedAndProjectExecTransformer +(34) TakeOrderedAndProjectExecTransformer Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X 
DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X], 0 -(34) VeloxColumnarToRowExec +(35) VeloxColumnarToRowExec Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(35) Scan parquet +(36) Scan parquet Output [2]: [c_custkey#X, c_mktsegment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_mktsegment), EqualTo(c_mktsegment,BUILDING), IsNotNull(c_custkey)] ReadSchema: struct -(36) Filter +(37) Filter Input [2]: [c_custkey#X, c_mktsegment#X] Condition : ((isnotnull(c_mktsegment#X) AND (c_mktsegment#X = BUILDING)) AND isnotnull(c_custkey#X)) -(37) Project +(38) Project Output [1]: [c_custkey#X] Input [2]: [c_custkey#X, c_mktsegment#X] -(38) BroadcastExchange +(39) BroadcastExchange Input [1]: [c_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(39) Scan parquet +(40) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(40) Filter +(41) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Condition : (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(41) BroadcastHashJoin +(42) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(42) Project +(43) Project Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(43) Scan parquet +(44) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(44) Filter +(45) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(45) Project +(46) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(46) BroadcastExchange +(47) BroadcastExchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(47) BroadcastHashJoin +(48) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(48) Project +(49) Project Output [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(49) HashAggregate +(50) HashAggregate Input [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(50) Exchange +(51) Exchange Input [5]: [l_orderkey#X, 
o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) HashAggregate +(52) HashAggregate Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, o_orderdate#X, o_shippriority#X] -(52) TakeOrderedAndProject +(53) TakeOrderedAndProject Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(53) AdaptiveSparkPlan +(54) AdaptiveSparkPlan Output [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/4.txt index 3969248bc2ec2..36c6de5374ba2 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/4.txt @@ -1,46 +1,45 @@ == Physical Plan == -AdaptiveSparkPlan (44) +AdaptiveSparkPlan (46) +- == Final Plan == - VeloxColumnarToRowExec (29) - +- ^ SortExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24) - +- ColumnarExchange (23) - +- ^ RegularHashAggregateExecTransformer (21) - +- ^ InputIteratorTransformer (20) - +- ^ InputAdapter (19) - +- ^ ShuffleQueryStage (18) - +- ColumnarExchange (17) + VeloxColumnarToRowExec (31) + +- ^ SortExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ InputIteratorTransformer (21) + +- ShuffleQueryStage (19) + +- ColumnarExchange (18) + +- VeloxAppendBatches (17) +- ^ ProjectExecTransformer (15) +- ^ FlushableHashAggregateExecTransformer (14) +- ^ ProjectExecTransformer (13) - +- ^ BroadcastHashJoinExecTransformer LeftSemi (12) + +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (12) :- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (11) - +- ^ InputAdapter (10) - +- ^ BroadcastQueryStage (9) - +- ColumnarBroadcastExchange (8) - +- ^ ProjectExecTransformer (6) - +- ^ FilterExecTransformer (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (9) + +- ColumnarBroadcastExchange (8) + +- ^ ProjectExecTransformer (6) + +- ^ FilterExecTransformer (5) + +- ^ Scan parquet (4) +- == Initial Plan == - Sort (43) - +- Exchange (42) - +- HashAggregate (41) - +- Exchange (40) - 
+- HashAggregate (39) - +- Project (38) - +- BroadcastHashJoin LeftSemi BuildRight (37) - :- Project (32) - : +- Filter (31) - : +- Scan parquet (30) - +- BroadcastExchange (36) - +- Project (35) - +- Filter (34) - +- Scan parquet (33) + Sort (45) + +- Exchange (44) + +- HashAggregate (43) + +- Exchange (42) + +- HashAggregate (41) + +- Project (40) + +- BroadcastHashJoin LeftSemi BuildRight (39) + :- Project (34) + : +- Filter (33) + : +- Scan parquet (32) + +- BroadcastExchange (38) + +- Project (37) + +- Filter (36) + +- Scan parquet (35) (1) Scan parquet @@ -115,125 +114,133 @@ Input [2]: [o_orderpriority#X, count#X] Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: false -(17) ColumnarExchange +(17) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] +Arguments: X + +(18) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] -(18) ShuffleQueryStage +(19) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] Arguments: X -(19) InputAdapter +(20) InputAdapter Input [2]: [o_orderpriority#X, count#X] -(20) InputIteratorTransformer +(21) InputIteratorTransformer Input [2]: [o_orderpriority#X, count#X] -(21) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [2]: [o_orderpriority#X, order_count#X] +Arguments: X + +(25) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] Arguments: X -(25) InputAdapter +(27) InputAdapter Input [2]: [o_orderpriority#X, order_count#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [2]: [o_orderpriority#X, order_count#X] -(27) SortExecTransformer +(29) SortExecTransformer Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(28) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(29) VeloxColumnarToRowExec +(31) VeloxColumnarToRowExec Input [2]: [o_orderpriority#X, order_count#X] -(30) Scan parquet +(32) Scan parquet Output [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-07-01), LessThan(o_orderdate,1993-10-01)] ReadSchema: struct -(31) Filter +(33) Filter Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Condition : ((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-07-01)) AND (o_orderdate#X < 1993-10-01)) -(32) Project +(34) Project Output [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] -(33) Scan parquet +(35) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), 
IsNotNull(l_receiptdate)] ReadSchema: struct -(34) Filter +(36) Filter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(35) Project +(37) Project Output [1]: [l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(36) BroadcastExchange +(38) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(37) BroadcastHashJoin +(39) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(38) Project +(40) Project Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(39) HashAggregate +(41) HashAggregate Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(40) Exchange +(42) Exchange Input [2]: [o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) HashAggregate +(43) HashAggregate Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(42) Exchange +(44) Exchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) Sort +(45) Sort Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(44) AdaptiveSparkPlan +(46) AdaptiveSparkPlan Output [2]: [o_orderpriority#X, order_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/5.txt index 678a52004a3f1..49ee27485b4ac 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/5.txt @@ -1,98 +1,93 @@ == Physical Plan == -AdaptiveSparkPlan (100) +AdaptiveSparkPlan (102) +- == Final Plan == - VeloxColumnarToRowExec (65) - +- ^ SortExecTransformer (63) - +- ^ InputIteratorTransformer (62) - +- ^ InputAdapter (61) - +- ^ ShuffleQueryStage (60) - +- ColumnarExchange (59) - +- ^ RegularHashAggregateExecTransformer (57) - +- ^ InputIteratorTransformer (56) - +- ^ InputAdapter (55) - +- ^ ShuffleQueryStage (54) - +- ColumnarExchange (53) + VeloxColumnarToRowExec (67) + +- ^ SortExecTransformer (65) + +- ^ InputIteratorTransformer (64) + +- ShuffleQueryStage (62) + +- ColumnarExchange (61) + +- VeloxAppendBatches (60) + +- ^ RegularHashAggregateExecTransformer (58) + +- ^ InputIteratorTransformer (57) + +- ShuffleQueryStage (55) + +- ColumnarExchange (54) + +- VeloxAppendBatches (53) +- ^ ProjectExecTransformer (51) +- ^ FlushableHashAggregateExecTransformer (50) +- ^ ProjectExecTransformer (49) - +- ^ BroadcastHashJoinExecTransformer Inner (48) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (48) :- ^ ProjectExecTransformer (39) - : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : :- ^ ProjectExecTransformer (30) - : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : :- ^ ProjectExecTransformer (21) - 
: : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : :- ^ ProjectExecTransformer (12) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : :- ^ InputIteratorTransformer (7) - : : : : : +- ^ InputAdapter (6) - : : : : : +- ^ BroadcastQueryStage (5) - : : : : : +- ColumnarBroadcastExchange (4) - : : : : : +- ^ FilterExecTransformer (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (5) + : : : : : +- ColumnarBroadcastExchange (4) + : : : : : +- ^ FilterExecTransformer (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ ProjectExecTransformer (10) : : : : +- ^ FilterExecTransformer (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (19) - : : : +- ^ InputAdapter (18) - : : : +- ^ BroadcastQueryStage (17) - : : : +- ColumnarBroadcastExchange (16) - : : : +- ^ FilterExecTransformer (14) - : : : +- ^ Scan parquet (13) + : : : +- BroadcastQueryStage (17) + : : : +- ColumnarBroadcastExchange (16) + : : : +- ^ FilterExecTransformer (14) + : : : +- ^ Scan parquet (13) : : +- ^ InputIteratorTransformer (28) - : : +- ^ InputAdapter (27) - : : +- ^ BroadcastQueryStage (26) - : : +- ColumnarBroadcastExchange (25) - : : +- ^ FilterExecTransformer (23) - : : +- ^ Scan parquet (22) + : : +- BroadcastQueryStage (26) + : : +- ColumnarBroadcastExchange (25) + : : +- ^ FilterExecTransformer (23) + : : +- ^ Scan parquet (22) : +- ^ InputIteratorTransformer (37) - : +- ^ InputAdapter (36) - : +- ^ BroadcastQueryStage (35) - : +- ColumnarBroadcastExchange (34) - : +- ^ FilterExecTransformer (32) - : +- ^ Scan parquet (31) + : +- BroadcastQueryStage (35) + : +- ColumnarBroadcastExchange (34) + : +- ^ FilterExecTransformer (32) + : +- ^ Scan parquet (31) +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ BroadcastQueryStage (45) - +- ColumnarBroadcastExchange (44) - +- ^ ProjectExecTransformer (42) - +- ^ FilterExecTransformer (41) - +- ^ Scan parquet (40) + +- BroadcastQueryStage (45) + +- ColumnarBroadcastExchange (44) + +- ^ ProjectExecTransformer (42) + +- ^ FilterExecTransformer (41) + +- ^ Scan parquet (40) +- == Initial Plan == - Sort (99) - +- Exchange (98) - +- HashAggregate (97) - +- Exchange (96) - +- HashAggregate (95) - +- Project (94) - +- BroadcastHashJoin Inner BuildRight (93) - :- Project (88) - : +- BroadcastHashJoin Inner BuildRight (87) - : :- Project (83) - : : +- BroadcastHashJoin Inner BuildRight (82) - : : :- Project (78) - : : : +- BroadcastHashJoin Inner BuildRight (77) - : : : :- Project (73) - : : : : +- BroadcastHashJoin Inner BuildLeft (72) - : : : : :- BroadcastExchange (68) - : : : : : +- Filter (67) - : : : : : +- Scan parquet (66) - : : : : +- Project (71) - : : : : +- Filter (70) - : : : : +- Scan parquet (69) - : : : +- BroadcastExchange (76) - : : : +- Filter (75) - : : : +- Scan parquet (74) - : : +- BroadcastExchange (81) - : : +- Filter (80) - : : +- Scan parquet (79) - : +- BroadcastExchange (86) - : +- Filter (85) - : +- Scan parquet (84) - +- BroadcastExchange (92) - +- Project (91) - +- Filter (90) - +- Scan parquet (89) + Sort (101) + +- Exchange (100) + +- HashAggregate (99) + +- Exchange (98) + +- HashAggregate (97) + +- Project (96) + +- BroadcastHashJoin Inner BuildRight (95) + :- Project (90) + : +- BroadcastHashJoin Inner BuildRight (89) + : :- Project (85) + : : +- BroadcastHashJoin Inner BuildRight (84) + : : :- Project (80) + : : : +- 
BroadcastHashJoin Inner BuildRight (79) + : : : :- Project (75) + : : : : +- BroadcastHashJoin Inner BuildLeft (74) + : : : : :- BroadcastExchange (70) + : : : : : +- Filter (69) + : : : : : +- Scan parquet (68) + : : : : +- Project (73) + : : : : +- Filter (72) + : : : : +- Scan parquet (71) + : : : +- BroadcastExchange (78) + : : : +- Filter (77) + : : : +- Scan parquet (76) + : : +- BroadcastExchange (83) + : : +- Filter (82) + : : +- Scan parquet (81) + : +- BroadcastExchange (88) + : +- Filter (87) + : +- Scan parquet (86) + +- BroadcastExchange (94) + +- Project (93) + +- Filter (92) + +- Scan parquet (91) (1) Scan parquet @@ -319,221 +314,229 @@ Input [3]: [n_name#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: false -(53) ColumnarExchange +(53) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] +Arguments: X + +(54) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(54) ShuffleQueryStage +(55) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] Arguments: X -(55) InputAdapter +(56) InputAdapter Input [3]: [n_name#X, sum#X, isEmpty#X] -(56) InputIteratorTransformer +(57) InputIteratorTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] -(57) RegularHashAggregateExecTransformer +(58) RegularHashAggregateExecTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(58) WholeStageCodegenTransformer (X) +(59) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(59) ColumnarExchange +(60) VeloxAppendBatches +Input [2]: [n_name#X, revenue#X] +Arguments: X + +(61) ColumnarExchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(60) ShuffleQueryStage +(62) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] Arguments: X -(61) InputAdapter +(63) InputAdapter Input [2]: [n_name#X, revenue#X] -(62) InputIteratorTransformer +(64) InputIteratorTransformer Input [2]: [n_name#X, revenue#X] -(63) SortExecTransformer +(65) SortExecTransformer Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(64) WholeStageCodegenTransformer (X) +(66) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(65) VeloxColumnarToRowExec +(67) VeloxColumnarToRowExec Input [2]: [n_name#X, revenue#X] -(66) Scan parquet +(68) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(67) Filter +(69) Filter Input 
[2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(68) BroadcastExchange +(70) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(69) Scan parquet +(71) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(71) Project +(73) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(72) BroadcastHashJoin +(74) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(73) Project +(75) Project Output [2]: [c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(74) Scan parquet +(76) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(75) Filter +(77) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(76) BroadcastExchange +(78) BroadcastExchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(77) BroadcastHashJoin +(79) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(78) Project +(80) Project Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(79) Scan parquet +(81) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(80) Filter +(82) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(81) BroadcastExchange +(83) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(82) BroadcastHashJoin +(84) BroadcastHashJoin Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join condition: None -(83) Project +(85) Project Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(84) Scan parquet +(86) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(85) Filter +(87) Filter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(86) BroadcastExchange +(88) BroadcastExchange Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: 
HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(87) BroadcastHashJoin +(89) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(88) Project +(90) Project Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(89) Scan parquet +(91) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(90) Filter +(92) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(91) Project +(93) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(92) BroadcastExchange +(94) BroadcastExchange Input [1]: [r_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(93) BroadcastHashJoin +(95) BroadcastHashJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(94) Project +(96) Project Output [3]: [l_extendedprice#X, l_discount#X, n_name#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(95) HashAggregate +(97) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, n_name#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(96) Exchange +(98) Exchange Input [3]: [n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) HashAggregate +(99) HashAggregate Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(98) Exchange +(100) Exchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Sort +(101) Sort Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(100) AdaptiveSparkPlan +(102) AdaptiveSparkPlan Output [2]: [n_name#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/6.txt index fa2536a365c8a..786a89fe715ae 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/6.txt +++ 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/6.txt @@ -1,23 +1,23 @@ == Physical Plan == -AdaptiveSparkPlan (19) +AdaptiveSparkPlan (20) +- == Final Plan == - VeloxColumnarToRowExec (12) - +- ^ RegularHashAggregateExecTransformer (10) - +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ ShuffleQueryStage (7) - +- ColumnarExchange (6) + VeloxColumnarToRowExec (13) + +- ^ RegularHashAggregateExecTransformer (11) + +- ^ InputIteratorTransformer (10) + +- ShuffleQueryStage (8) + +- ColumnarExchange (7) + +- VeloxAppendBatches (6) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ FilterExecTransformer (2) +- ^ Scan parquet (1) +- == Initial Plan == - HashAggregate (18) - +- Exchange (17) - +- HashAggregate (16) - +- Project (15) - +- Filter (14) - +- Scan parquet (13) + HashAggregate (19) + +- Exchange (18) + +- HashAggregate (17) + +- Project (16) + +- Filter (15) + +- Scan parquet (14) (1) Scan parquet @@ -46,67 +46,71 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(6) ColumnarExchange +(6) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(7) ShuffleQueryStage +(8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(8) InputAdapter +(9) InputAdapter Input [2]: [sum#X, isEmpty#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(10) RegularHashAggregateExecTransformer +(11) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X AS revenue#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(12) VeloxColumnarToRowExec +(13) VeloxColumnarToRowExec Input [1]: [revenue#X] -(13) Scan parquet +(14) Scan parquet Output [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), IsNotNull(l_discount), IsNotNull(l_quantity), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), GreaterThanOrEqual(l_discount,0.05), LessThanOrEqual(l_discount,0.07), LessThan(l_quantity,24.00)] ReadSchema: struct -(14) Filter +(15) Filter Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((((((isnotnull(l_shipdate#X) AND isnotnull(l_discount#X)) AND isnotnull(l_quantity#X)) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND (l_discount#X >= 0.05)) AND (l_discount#X <= 0.07)) AND (l_quantity#X < 24.00)) -(15) Project +(16) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) HashAggregate +(17) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(17) 
Exchange +(18) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(18) HashAggregate +(19) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X AS revenue#X] -(19) AdaptiveSparkPlan +(20) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/7.txt index 19b6cc4b8aee6..2ba42f806f3e5 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/7.txt @@ -1,92 +1,87 @@ == Physical Plan == -AdaptiveSparkPlan (93) +AdaptiveSparkPlan (95) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- ^ SortExecTransformer (58) - +- ^ InputIteratorTransformer (57) - +- ^ InputAdapter (56) - +- ^ ShuffleQueryStage (55) - +- ColumnarExchange (54) - +- ^ RegularHashAggregateExecTransformer (52) - +- ^ InputIteratorTransformer (51) - +- ^ InputAdapter (50) - +- ^ ShuffleQueryStage (49) - +- ColumnarExchange (48) + VeloxColumnarToRowExec (62) + +- ^ SortExecTransformer (60) + +- ^ InputIteratorTransformer (59) + +- ShuffleQueryStage (57) + +- ColumnarExchange (56) + +- VeloxAppendBatches (55) + +- ^ RegularHashAggregateExecTransformer (53) + +- ^ InputIteratorTransformer (52) + +- ShuffleQueryStage (50) + +- ColumnarExchange (49) + +- VeloxAppendBatches (48) +- ^ ProjectExecTransformer (46) +- ^ FlushableHashAggregateExecTransformer (45) +- ^ ProjectExecTransformer (44) - +- ^ BroadcastHashJoinExecTransformer Inner (43) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (43) :- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (37) : :- ^ ProjectExecTransformer (29) - : : +- ^ BroadcastHashJoinExecTransformer Inner (28) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (28) : : :- ^ ProjectExecTransformer (20) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (19) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (19) : : : :- ^ ProjectExecTransformer (11) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (10) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (10) : : : : :- ^ InputIteratorTransformer (7) - : : : : : +- ^ InputAdapter (6) - : : : : : +- ^ BroadcastQueryStage (5) - : : : : : +- ColumnarBroadcastExchange (4) - : : : : : +- ^ FilterExecTransformer (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (5) + : : : : : +- ColumnarBroadcastExchange (4) + : : : : : +- ^ FilterExecTransformer (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ FilterExecTransformer (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (18) - : : : +- ^ InputAdapter (17) - : : : +- ^ BroadcastQueryStage (16) - : : : +- ColumnarBroadcastExchange (15) - : : : +- ^ FilterExecTransformer (13) - : : : +- ^ Scan parquet (12) + : : : +- BroadcastQueryStage (16) + : : : +- ColumnarBroadcastExchange 
(15) + : : : +- ^ FilterExecTransformer (13) + : : : +- ^ Scan parquet (12) : : +- ^ InputIteratorTransformer (27) - : : +- ^ InputAdapter (26) - : : +- ^ BroadcastQueryStage (25) - : : +- ColumnarBroadcastExchange (24) - : : +- ^ FilterExecTransformer (22) - : : +- ^ Scan parquet (21) + : : +- BroadcastQueryStage (25) + : : +- ColumnarBroadcastExchange (24) + : : +- ^ FilterExecTransformer (22) + : : +- ^ Scan parquet (21) : +- ^ InputIteratorTransformer (36) - : +- ^ InputAdapter (35) - : +- ^ BroadcastQueryStage (34) - : +- ColumnarBroadcastExchange (33) - : +- ^ FilterExecTransformer (31) - : +- ^ Scan parquet (30) + : +- BroadcastQueryStage (34) + : +- ColumnarBroadcastExchange (33) + : +- ^ FilterExecTransformer (31) + : +- ^ Scan parquet (30) +- ^ InputIteratorTransformer (42) - +- ^ InputAdapter (41) - +- ^ BroadcastQueryStage (40) - +- ReusedExchange (39) + +- BroadcastQueryStage (40) + +- ReusedExchange (39) +- == Initial Plan == - Sort (92) - +- Exchange (91) - +- HashAggregate (90) - +- Exchange (89) - +- HashAggregate (88) - +- Project (87) - +- BroadcastHashJoin Inner BuildRight (86) - :- Project (82) - : +- BroadcastHashJoin Inner BuildRight (81) - : :- Project (77) - : : +- BroadcastHashJoin Inner BuildRight (76) - : : :- Project (72) - : : : +- BroadcastHashJoin Inner BuildRight (71) - : : : :- Project (67) - : : : : +- BroadcastHashJoin Inner BuildLeft (66) - : : : : :- BroadcastExchange (63) - : : : : : +- Filter (62) - : : : : : +- Scan parquet (61) - : : : : +- Filter (65) - : : : : +- Scan parquet (64) - : : : +- BroadcastExchange (70) - : : : +- Filter (69) - : : : +- Scan parquet (68) - : : +- BroadcastExchange (75) - : : +- Filter (74) - : : +- Scan parquet (73) - : +- BroadcastExchange (80) - : +- Filter (79) - : +- Scan parquet (78) - +- BroadcastExchange (85) - +- Filter (84) - +- Scan parquet (83) + Sort (94) + +- Exchange (93) + +- HashAggregate (92) + +- Exchange (91) + +- HashAggregate (90) + +- Project (89) + +- BroadcastHashJoin Inner BuildRight (88) + :- Project (84) + : +- BroadcastHashJoin Inner BuildRight (83) + : :- Project (79) + : : +- BroadcastHashJoin Inner BuildRight (78) + : : :- Project (74) + : : : +- BroadcastHashJoin Inner BuildRight (73) + : : : :- Project (69) + : : : : +- BroadcastHashJoin Inner BuildLeft (68) + : : : : :- BroadcastExchange (65) + : : : : : +- Filter (64) + : : : : : +- Scan parquet (63) + : : : : +- Filter (67) + : : : : +- Scan parquet (66) + : : : +- BroadcastExchange (72) + : : : +- Filter (71) + : : : +- Scan parquet (70) + : : +- BroadcastExchange (77) + : : +- Filter (76) + : : +- Scan parquet (75) + : +- BroadcastExchange (82) + : +- Filter (81) + : +- Scan parquet (80) + +- BroadcastExchange (87) + +- Filter (86) + +- Scan parquet (85) (1) Scan parquet @@ -289,213 +284,221 @@ Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: false -(48) ColumnarExchange +(48) VeloxAppendBatches +Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] +Arguments: X + +(49) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(49) ShuffleQueryStage +(50) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] 
Arguments: X -(50) InputAdapter +(51) InputAdapter Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(51) InputIteratorTransformer +(52) InputIteratorTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(52) RegularHashAggregateExecTransformer +(53) RegularHashAggregateExecTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(53) WholeStageCodegenTransformer (X) +(54) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(54) ColumnarExchange +(55) VeloxAppendBatches +Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] +Arguments: X + +(56) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(55) ShuffleQueryStage +(57) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: X -(56) InputAdapter +(58) InputAdapter Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(57) InputIteratorTransformer +(59) InputIteratorTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(58) SortExecTransformer +(60) SortExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(59) WholeStageCodegenTransformer (X) +(61) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(60) VeloxColumnarToRowExec +(62) VeloxColumnarToRowExec Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(61) Scan parquet +(63) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(62) Filter +(64) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(63) BroadcastExchange +(65) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(64) Scan parquet +(66) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(65) Filter +(67) Filter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(66) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(67) Project +(69) Project Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(68) Scan parquet +(70) Scan parquet 
Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(69) Filter +(71) Filter Input [2]: [o_orderkey#X, o_custkey#X] Condition : (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(70) BroadcastExchange +(72) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(71) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(72) Project +(74) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(73) Scan parquet +(75) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(74) Filter +(76) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(75) BroadcastExchange +(77) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(76) BroadcastHashJoin +(78) BroadcastHashJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(77) Project +(79) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(78) Scan parquet +(80) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(79) Filter +(81) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(80) BroadcastExchange +(82) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(81) BroadcastHashJoin +(83) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(82) Project +(84) Project Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(83) Scan parquet +(85) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,GERMANY),EqualTo(n_name,FRANCE))] ReadSchema: struct -(84) Filter +(86) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = GERMANY) OR (n_name#X = FRANCE))) -(85) BroadcastExchange +(87) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(87) Project +(89) Project Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, 
CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(88) HashAggregate +(90) HashAggregate Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(89) Exchange +(91) Exchange Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) HashAggregate +(92) HashAggregate Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(91) Exchange +(93) Exchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(92) Sort +(94) Sort Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(93) AdaptiveSparkPlan +(95) AdaptiveSparkPlan Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/8.txt index 3aa6e39645ccc..64a40563c2381 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/8.txt @@ -1,125 +1,118 @@ == Physical Plan == -AdaptiveSparkPlan (129) +AdaptiveSparkPlan (131) +- == Final Plan == - VeloxColumnarToRowExec (84) - +- ^ SortExecTransformer (82) - +- ^ InputIteratorTransformer (81) - +- ^ InputAdapter (80) - +- ^ ShuffleQueryStage (79) - +- ColumnarExchange (78) - +- ^ ProjectExecTransformer (76) - +- ^ RegularHashAggregateExecTransformer (75) - +- ^ InputIteratorTransformer (74) - +- ^ InputAdapter (73) - +- ^ ShuffleQueryStage (72) - +- ColumnarExchange (71) + VeloxColumnarToRowExec (86) + +- ^ SortExecTransformer (84) + +- ^ InputIteratorTransformer (83) + +- ShuffleQueryStage (81) + +- ColumnarExchange (80) + +- VeloxAppendBatches (79) + +- ^ ProjectExecTransformer (77) + +- ^ RegularHashAggregateExecTransformer (76) + +- ^ InputIteratorTransformer (75) + +- ShuffleQueryStage (73) + +- ColumnarExchange (72) + +- VeloxAppendBatches (71) +- ^ ProjectExecTransformer (69) +- ^ FlushableHashAggregateExecTransformer (68) +- ^ ProjectExecTransformer (67) - +- ^ BroadcastHashJoinExecTransformer Inner (66) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (66) :- ^ ProjectExecTransformer (57) - : +- ^ BroadcastHashJoinExecTransformer Inner (56) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (56) : :- ^ ProjectExecTransformer (48) - : : +- ^ BroadcastHashJoinExecTransformer Inner (47) + : : +- ^ BroadcastHashJoinExecTransformer Inner 
BuildRight (47) : : :- ^ ProjectExecTransformer (39) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : : : :- ^ ProjectExecTransformer (30) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : : : :- ^ ProjectExecTransformer (21) - : : : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : : : :- ^ ProjectExecTransformer (12) - : : : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : : : :- ^ InputIteratorTransformer (8) - : : : : : : : +- ^ InputAdapter (7) - : : : : : : : +- ^ BroadcastQueryStage (6) - : : : : : : : +- ColumnarBroadcastExchange (5) - : : : : : : : +- ^ ProjectExecTransformer (3) - : : : : : : : +- ^ FilterExecTransformer (2) - : : : : : : : +- ^ Scan parquet (1) + : : : : : : : +- BroadcastQueryStage (6) + : : : : : : : +- ColumnarBroadcastExchange (5) + : : : : : : : +- ^ ProjectExecTransformer (3) + : : : : : : : +- ^ FilterExecTransformer (2) + : : : : : : : +- ^ Scan parquet (1) : : : : : : +- ^ FilterExecTransformer (10) : : : : : : +- ^ Scan parquet (9) : : : : : +- ^ InputIteratorTransformer (19) - : : : : : +- ^ InputAdapter (18) - : : : : : +- ^ BroadcastQueryStage (17) - : : : : : +- ColumnarBroadcastExchange (16) - : : : : : +- ^ FilterExecTransformer (14) - : : : : : +- ^ Scan parquet (13) + : : : : : +- BroadcastQueryStage (17) + : : : : : +- ColumnarBroadcastExchange (16) + : : : : : +- ^ FilterExecTransformer (14) + : : : : : +- ^ Scan parquet (13) : : : : +- ^ InputIteratorTransformer (28) - : : : : +- ^ InputAdapter (27) - : : : : +- ^ BroadcastQueryStage (26) - : : : : +- ColumnarBroadcastExchange (25) - : : : : +- ^ FilterExecTransformer (23) - : : : : +- ^ Scan parquet (22) + : : : : +- BroadcastQueryStage (26) + : : : : +- ColumnarBroadcastExchange (25) + : : : : +- ^ FilterExecTransformer (23) + : : : : +- ^ Scan parquet (22) : : : +- ^ InputIteratorTransformer (37) - : : : +- ^ InputAdapter (36) - : : : +- ^ BroadcastQueryStage (35) - : : : +- ColumnarBroadcastExchange (34) - : : : +- ^ FilterExecTransformer (32) - : : : +- ^ Scan parquet (31) + : : : +- BroadcastQueryStage (35) + : : : +- ColumnarBroadcastExchange (34) + : : : +- ^ FilterExecTransformer (32) + : : : +- ^ Scan parquet (31) : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ BroadcastQueryStage (44) - : : +- ColumnarBroadcastExchange (43) - : : +- ^ FilterExecTransformer (41) - : : +- ^ Scan parquet (40) + : : +- BroadcastQueryStage (44) + : : +- ColumnarBroadcastExchange (43) + : : +- ^ FilterExecTransformer (41) + : : +- ^ Scan parquet (40) : +- ^ InputIteratorTransformer (55) - : +- ^ InputAdapter (54) - : +- ^ BroadcastQueryStage (53) - : +- ColumnarBroadcastExchange (52) - : +- ^ FilterExecTransformer (50) - : +- ^ Scan parquet (49) + : +- BroadcastQueryStage (53) + : +- ColumnarBroadcastExchange (52) + : +- ^ FilterExecTransformer (50) + : +- ^ Scan parquet (49) +- ^ InputIteratorTransformer (65) - +- ^ InputAdapter (64) - +- ^ BroadcastQueryStage (63) - +- ColumnarBroadcastExchange (62) - +- ^ ProjectExecTransformer (60) - +- ^ FilterExecTransformer (59) - +- ^ Scan parquet (58) + +- BroadcastQueryStage (63) + +- ColumnarBroadcastExchange (62) + +- ^ ProjectExecTransformer (60) + +- ^ FilterExecTransformer (59) + +- ^ Scan parquet (58) 
+- == Initial Plan == - Sort (128) - +- Exchange (127) - +- HashAggregate (126) - +- Exchange (125) - +- HashAggregate (124) - +- Project (123) - +- BroadcastHashJoin Inner BuildRight (122) - :- Project (117) - : +- BroadcastHashJoin Inner BuildRight (116) - : :- Project (112) - : : +- BroadcastHashJoin Inner BuildRight (111) - : : :- Project (107) - : : : +- BroadcastHashJoin Inner BuildRight (106) - : : : :- Project (102) - : : : : +- BroadcastHashJoin Inner BuildRight (101) - : : : : :- Project (97) - : : : : : +- BroadcastHashJoin Inner BuildRight (96) - : : : : : :- Project (92) - : : : : : : +- BroadcastHashJoin Inner BuildLeft (91) - : : : : : : :- BroadcastExchange (88) - : : : : : : : +- Project (87) - : : : : : : : +- Filter (86) - : : : : : : : +- Scan parquet (85) - : : : : : : +- Filter (90) - : : : : : : +- Scan parquet (89) - : : : : : +- BroadcastExchange (95) - : : : : : +- Filter (94) - : : : : : +- Scan parquet (93) - : : : : +- BroadcastExchange (100) - : : : : +- Filter (99) - : : : : +- Scan parquet (98) - : : : +- BroadcastExchange (105) - : : : +- Filter (104) - : : : +- Scan parquet (103) - : : +- BroadcastExchange (110) - : : +- Filter (109) - : : +- Scan parquet (108) - : +- BroadcastExchange (115) - : +- Filter (114) - : +- Scan parquet (113) - +- BroadcastExchange (121) - +- Project (120) - +- Filter (119) - +- Scan parquet (118) + Sort (130) + +- Exchange (129) + +- HashAggregate (128) + +- Exchange (127) + +- HashAggregate (126) + +- Project (125) + +- BroadcastHashJoin Inner BuildRight (124) + :- Project (119) + : +- BroadcastHashJoin Inner BuildRight (118) + : :- Project (114) + : : +- BroadcastHashJoin Inner BuildRight (113) + : : :- Project (109) + : : : +- BroadcastHashJoin Inner BuildRight (108) + : : : :- Project (104) + : : : : +- BroadcastHashJoin Inner BuildRight (103) + : : : : :- Project (99) + : : : : : +- BroadcastHashJoin Inner BuildRight (98) + : : : : : :- Project (94) + : : : : : : +- BroadcastHashJoin Inner BuildLeft (93) + : : : : : : :- BroadcastExchange (90) + : : : : : : : +- Project (89) + : : : : : : : +- Filter (88) + : : : : : : : +- Scan parquet (87) + : : : : : : +- Filter (92) + : : : : : : +- Scan parquet (91) + : : : : : +- BroadcastExchange (97) + : : : : : +- Filter (96) + : : : : : +- Scan parquet (95) + : : : : +- BroadcastExchange (102) + : : : : +- Filter (101) + : : : : +- Scan parquet (100) + : : : +- BroadcastExchange (107) + : : : +- Filter (106) + : : : +- Scan parquet (105) + : : +- BroadcastExchange (112) + : : +- Filter (111) + : : +- Scan parquet (110) + : +- BroadcastExchange (117) + : +- Filter (116) + : +- Scan parquet (115) + +- BroadcastExchange (123) + +- Project (122) + +- Filter (121) + +- Scan parquet (120) (1) Scan parquet @@ -422,273 +415,281 @@ Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(71) ColumnarExchange +(71) VeloxAppendBatches +Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(72) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(72) ShuffleQueryStage +(73) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(73) InputAdapter +(74) InputAdapter Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(74) 
InputIteratorTransformer +(75) InputIteratorTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(75) RegularHashAggregateExecTransformer +(76) RegularHashAggregateExecTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(76) ProjectExecTransformer +(77) ProjectExecTransformer Output [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6), true) AS mkt_share#X] Input [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(77) WholeStageCodegenTransformer (X) +(78) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(78) ColumnarExchange +(79) VeloxAppendBatches +Input [2]: [o_year#X, mkt_share#X] +Arguments: X + +(80) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(79) ShuffleQueryStage +(81) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] Arguments: X -(80) InputAdapter +(82) InputAdapter Input [2]: [o_year#X, mkt_share#X] -(81) InputIteratorTransformer +(83) InputIteratorTransformer Input [2]: [o_year#X, mkt_share#X] -(82) SortExecTransformer +(84) SortExecTransformer Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(83) WholeStageCodegenTransformer (X) +(85) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(84) VeloxColumnarToRowExec +(86) VeloxColumnarToRowExec Input [2]: [o_year#X, mkt_share#X] -(85) Scan parquet +(87) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_type), EqualTo(p_type,ECONOMY ANODIZED STEEL), IsNotNull(p_partkey)] ReadSchema: struct -(86) Filter +(88) Filter Input [2]: [p_partkey#X, p_type#X] Condition : ((isnotnull(p_type#X) AND (p_type#X = ECONOMY ANODIZED STEEL)) AND isnotnull(p_partkey#X)) -(87) Project +(89) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_type#X] -(88) BroadcastExchange +(90) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(89) Scan parquet +(91) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(90) Filter +(92) Filter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(91) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(92) Project +(94) Project Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(93) Scan parquet +(95) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex 
[*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(94) Filter +(96) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(95) BroadcastExchange +(97) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(96) BroadcastHashJoin +(98) BroadcastHashJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(97) Project +(99) Project Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(98) Scan parquet +(100) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(99) Filter +(101) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(100) BroadcastExchange +(102) BroadcastExchange Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(101) BroadcastHashJoin +(103) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(102) Project +(104) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(103) Scan parquet +(105) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(104) Filter +(106) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(105) BroadcastExchange +(107) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(106) BroadcastHashJoin +(108) BroadcastHashJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(107) Project +(109) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(108) Scan parquet +(110) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(109) Filter +(111) Filter Input [2]: [n_nationkey#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(110) BroadcastExchange +(112) BroadcastExchange Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(111) BroadcastHashJoin +(113) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(112) Project +(114) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: 
[l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(113) Scan parquet +(115) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(114) Filter +(116) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(115) BroadcastExchange +(117) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(116) BroadcastHashJoin +(118) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(117) Project +(119) Project Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(118) Scan parquet +(120) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(119) Filter +(121) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(120) Project +(122) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(121) BroadcastExchange +(123) BroadcastExchange Input [1]: [r_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(122) BroadcastHashJoin +(124) BroadcastHashJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(123) Project +(125) Project Output [3]: [year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS volume#X, n_name#X AS nation#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(124) HashAggregate +(126) HashAggregate Input [3]: [o_year#X, volume#X, nation#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(125) Exchange +(127) Exchange Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(126) HashAggregate +(128) HashAggregate Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6), true) AS mkt_share#X] -(127) Exchange +(129) Exchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(128) Sort +(130) Sort Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(129) AdaptiveSparkPlan +(131) AdaptiveSparkPlan Output [2]: [o_year#X, mkt_share#X] Arguments: 
isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/9.txt index b00947a3e3e0e..b5a21a0261d2e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/9.txt @@ -1,96 +1,91 @@ == Physical Plan == -AdaptiveSparkPlan (98) +AdaptiveSparkPlan (100) +- == Final Plan == - VeloxColumnarToRowExec (64) - +- ^ SortExecTransformer (62) - +- ^ InputIteratorTransformer (61) - +- ^ InputAdapter (60) - +- ^ ShuffleQueryStage (59) - +- ColumnarExchange (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53) - +- ColumnarExchange (52) + VeloxColumnarToRowExec (66) + +- ^ SortExecTransformer (64) + +- ^ InputIteratorTransformer (63) + +- ShuffleQueryStage (61) + +- ColumnarExchange (60) + +- VeloxAppendBatches (59) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ InputIteratorTransformer (56) + +- ShuffleQueryStage (54) + +- ColumnarExchange (53) + +- VeloxAppendBatches (52) +- ^ ProjectExecTransformer (50) +- ^ FlushableHashAggregateExecTransformer (49) +- ^ ProjectExecTransformer (48) - +- ^ BroadcastHashJoinExecTransformer Inner (47) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) :- ^ ProjectExecTransformer (39) - : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : :- ^ ProjectExecTransformer (30) - : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : :- ^ ProjectExecTransformer (21) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : :- ^ ProjectExecTransformer (12) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ BroadcastQueryStage (6) - : : : : : +- ColumnarBroadcastExchange (5) - : : : : : +- ^ ProjectExecTransformer (3) - : : : : : +- ^ FilterExecTransformer (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (6) + : : : : : +- ColumnarBroadcastExchange (5) + : : : : : +- ^ ProjectExecTransformer (3) + : : : : : +- ^ FilterExecTransformer (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ FilterExecTransformer (10) : : : : +- ^ Scan parquet (9) : : : +- ^ InputIteratorTransformer (19) - : : : +- ^ InputAdapter (18) - : : : +- ^ BroadcastQueryStage (17) - : : : +- ColumnarBroadcastExchange (16) - : : : +- ^ FilterExecTransformer (14) - : : : +- ^ Scan parquet (13) + : : : +- BroadcastQueryStage (17) + : : : +- ColumnarBroadcastExchange (16) + : : : +- ^ FilterExecTransformer (14) + : : : +- ^ Scan parquet (13) : : +- ^ InputIteratorTransformer (28) - : : +- ^ InputAdapter (27) - : : +- ^ BroadcastQueryStage (26) - : : +- ColumnarBroadcastExchange (25) - : : +- ^ FilterExecTransformer (23) - : : +- ^ Scan parquet (22) + : : +- BroadcastQueryStage (26) + : : +- ColumnarBroadcastExchange (25) + : : +- ^ FilterExecTransformer (23) + : : +- ^ Scan parquet (22) : +- ^ InputIteratorTransformer (37) - : +- ^ InputAdapter (36) - : +- ^ BroadcastQueryStage (35) - : +- ColumnarBroadcastExchange (34) - : +- ^ FilterExecTransformer (32) - : +- ^ Scan parquet 
(31) + : +- BroadcastQueryStage (35) + : +- ColumnarBroadcastExchange (34) + : +- ^ FilterExecTransformer (32) + : +- ^ Scan parquet (31) +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ BroadcastQueryStage (44) - +- ColumnarBroadcastExchange (43) - +- ^ FilterExecTransformer (41) - +- ^ Scan parquet (40) + +- BroadcastQueryStage (44) + +- ColumnarBroadcastExchange (43) + +- ^ FilterExecTransformer (41) + +- ^ Scan parquet (40) +- == Initial Plan == - Sort (97) - +- Exchange (96) - +- HashAggregate (95) - +- Exchange (94) - +- HashAggregate (93) - +- Project (92) - +- BroadcastHashJoin Inner BuildRight (91) - :- Project (87) - : +- BroadcastHashJoin Inner BuildRight (86) - : :- Project (82) - : : +- BroadcastHashJoin Inner BuildRight (81) - : : :- Project (77) - : : : +- BroadcastHashJoin Inner BuildRight (76) - : : : :- Project (72) - : : : : +- BroadcastHashJoin Inner BuildLeft (71) - : : : : :- BroadcastExchange (68) - : : : : : +- Project (67) - : : : : : +- Filter (66) - : : : : : +- Scan parquet (65) - : : : : +- Filter (70) - : : : : +- Scan parquet (69) - : : : +- BroadcastExchange (75) - : : : +- Filter (74) - : : : +- Scan parquet (73) - : : +- BroadcastExchange (80) - : : +- Filter (79) - : : +- Scan parquet (78) - : +- BroadcastExchange (85) - : +- Filter (84) - : +- Scan parquet (83) - +- BroadcastExchange (90) - +- Filter (89) - +- Scan parquet (88) + Sort (99) + +- Exchange (98) + +- HashAggregate (97) + +- Exchange (96) + +- HashAggregate (95) + +- Project (94) + +- BroadcastHashJoin Inner BuildRight (93) + :- Project (89) + : +- BroadcastHashJoin Inner BuildRight (88) + : :- Project (84) + : : +- BroadcastHashJoin Inner BuildRight (83) + : : :- Project (79) + : : : +- BroadcastHashJoin Inner BuildRight (78) + : : : :- Project (74) + : : : : +- BroadcastHashJoin Inner BuildLeft (73) + : : : : :- BroadcastExchange (70) + : : : : : +- Project (69) + : : : : : +- Filter (68) + : : : : : +- Scan parquet (67) + : : : : +- Filter (72) + : : : : +- Scan parquet (71) + : : : +- BroadcastExchange (77) + : : : +- Filter (76) + : : : +- Scan parquet (75) + : : +- BroadcastExchange (82) + : : +- Filter (81) + : : +- Scan parquet (80) + : +- BroadcastExchange (87) + : +- Filter (86) + : +- Scan parquet (85) + +- BroadcastExchange (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet @@ -313,217 +308,225 @@ Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: false -(52) ColumnarExchange +(52) VeloxAppendBatches +Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] +Arguments: X + +(53) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(54) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: X -(54) InputAdapter +(55) InputAdapter Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(55) InputIteratorTransformer +(56) InputIteratorTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(56) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(57) WholeStageCodegenTransformer (X) +(58) 
WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(58) ColumnarExchange +(59) VeloxAppendBatches +Input [3]: [nation#X, o_year#X, sum_profit#X] +Arguments: X + +(60) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(61) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: X -(60) InputAdapter +(62) InputAdapter Input [3]: [nation#X, o_year#X, sum_profit#X] -(61) InputIteratorTransformer +(63) InputIteratorTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] -(62) SortExecTransformer +(64) SortExecTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(63) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(64) VeloxColumnarToRowExec +(66) VeloxColumnarToRowExec Input [3]: [nation#X, o_year#X, sum_profit#X] -(65) Scan parquet +(67) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringContains(p_name,green), IsNotNull(p_partkey)] ReadSchema: struct -(66) Filter +(68) Filter Input [2]: [p_partkey#X, p_name#X] Condition : ((isnotnull(p_name#X) AND Contains(p_name#X, green)) AND isnotnull(p_partkey#X)) -(67) Project +(69) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(68) BroadcastExchange +(70) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(69) Scan parquet +(71) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(71) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(72) Project +(74) Project Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(73) Scan parquet +(75) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(74) Filter +(76) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(75) BroadcastExchange +(77) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(76) BroadcastHashJoin +(78) BroadcastHashJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(77) Project +(79) Project Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(78) Scan parquet 
+(80) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(79) Filter +(81) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Condition : (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(80) BroadcastExchange +(82) BroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, false], input[0, bigint, false]),false), [plan_id=X] -(81) BroadcastHashJoin +(83) BroadcastHashJoin Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join condition: None -(82) Project +(84) Project Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(83) Scan parquet +(85) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(84) Filter +(86) Filter Input [2]: [o_orderkey#X, o_orderdate#X] Condition : isnotnull(o_orderkey#X) -(85) BroadcastExchange +(87) BroadcastExchange Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(87) Project +(89) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(88) Scan parquet +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(89) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(90) BroadcastExchange +(92) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(91) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(92) Project +(94) Project Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) as decimal(27,4))) - promote_precision(cast(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(l_quantity#X)), DecimalType(25,4), true) as decimal(27,4)))), DecimalType(27,4), true) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(93) HashAggregate +(95) HashAggregate Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(94) Exchange +(96) Exchange Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, 
o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(95) HashAggregate +(97) HashAggregate Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(96) Exchange +(98) Exchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) Sort +(99) Sort Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(98) AdaptiveSparkPlan +(100) AdaptiveSparkPlan Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/1.txt index 0cacd70fdca74..22dd5100c4fb3 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/1.txt @@ -1,31 +1,31 @@ == Physical Plan == -AdaptiveSparkPlan (28) +AdaptiveSparkPlan (30) +- == Final Plan == - VeloxColumnarToRowExec (19) - +- ^ SortExecTransformer (17) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8), Statistics(X) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (21) + +- ^ SortExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9), Statistics(X) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ ProjectExecTransformer (5) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ FilterExecTransformer (2) +- ^ Scan parquet (1) +- == Initial Plan == - Sort (27) - +- Exchange (26) - +- HashAggregate (25) - +- Exchange (24) - +- HashAggregate (23) - +- Project (22) - +- Filter (21) - +- Scan parquet (20) + Sort (29) + +- Exchange (28) + +- HashAggregate (27) + +- Exchange (26) + +- HashAggregate (25) + +- Project (24) + +- Filter (23) + +- Scan parquet (22) (1) Scan parquet @@ -58,97 +58,105 @@ Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] +Arguments: X + +(8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, 
sum#X, count#X, count#X], [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(12) 
WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] +Arguments: X + +(15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(17) SortExecTransformer +(19) SortExecTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(18) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(19) VeloxColumnarToRowExec +(21) VeloxColumnarToRowExec Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(20) Scan parquet +(22) Scan parquet Output [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), LessThanOrEqual(l_shipdate,1998-09-02)] ReadSchema: struct -(21) Filter +(23) Filter Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Condition : (isnotnull(l_shipdate#X) AND (l_shipdate#X <= 1998-09-02)) -(22) Project +(24) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] -(23) HashAggregate +(25) HashAggregate Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [partial_sum(l_quantity#X), partial_sum(l_extendedprice#X), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))), partial_sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * 
promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6))), partial_avg(l_quantity#X), partial_avg(l_extendedprice#X), partial_avg(l_discount#X), partial_count(1)] Aggregate Attributes [15]: [sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Results [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(24) Exchange +(26) Exchange Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(25) HashAggregate +(27) HashAggregate Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as 
decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(26) Exchange +(28) Exchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Sort +(29) Sort Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(28) AdaptiveSparkPlan +(30) AdaptiveSparkPlan Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/10.txt index 2977896f3da87..8681ecf5f93f6 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/10.txt @@ -1,68 +1,65 @@ == Physical Plan == -AdaptiveSparkPlan (67) +AdaptiveSparkPlan (68) +- == Final Plan == - VeloxColumnarToRowExec (43) - +- TakeOrderedAndProjectExecTransformer (42) - +- ^ ProjectExecTransformer (40) - +- ^ RegularHashAggregateExecTransformer (39) - +- ^ InputIteratorTransformer (38) - +- ^ InputAdapter (37) - +- ^ ShuffleQueryStage (36), Statistics(X) - +- ColumnarExchange (35) + VeloxColumnarToRowExec (44) + +- TakeOrderedAndProjectExecTransformer (43) + +- ^ ProjectExecTransformer (41) + +- ^ RegularHashAggregateExecTransformer (40) + +- ^ InputIteratorTransformer (39) + +- ShuffleQueryStage (37), Statistics(X) + +- ColumnarExchange (36) + +- VeloxAppendBatches (35) +- ^ ProjectExecTransformer (33) +- ^ FlushableHashAggregateExecTransformer (32) +- ^ ProjectExecTransformer (31) - +- ^ BroadcastHashJoinExecTransformer Inner (30) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (30) :- ^ ProjectExecTransformer (22) - : +- ^ BroadcastHashJoinExecTransformer Inner (21) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (21) : :- ^ ProjectExecTransformer (12) - : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) : : :- ^ FilterExecTransformer (2) : : : +- ^ Scan parquet (1) : : +- ^ InputIteratorTransformer (10) - : : +- ^ InputAdapter (9) - : : +- ^ BroadcastQueryStage (8), Statistics(X) - : : +- ColumnarBroadcastExchange (7) - : : +- ^ ProjectExecTransformer (5) - : : +- ^ FilterExecTransformer (4) - : : +- ^ Scan parquet (3) + : : +- BroadcastQueryStage (8), Statistics(X) + : : +- ColumnarBroadcastExchange (7) + : : +- ^ ProjectExecTransformer (5) + : : +- ^ FilterExecTransformer (4) + : : +- ^ Scan parquet (3) : +- ^ InputIteratorTransformer (20) - : +- ^ InputAdapter (19) - : +- ^ BroadcastQueryStage (18), Statistics(X) - : +- ColumnarBroadcastExchange (17) - : +- ^ ProjectExecTransformer (15) - : +- ^ FilterExecTransformer (14) - : +- ^ Scan parquet (13) + : +- BroadcastQueryStage (18), Statistics(X) + : +- ColumnarBroadcastExchange (17) + : +- ^ 
ProjectExecTransformer (15) + : +- ^ FilterExecTransformer (14) + : +- ^ Scan parquet (13) +- ^ InputIteratorTransformer (29) - +- ^ InputAdapter (28) - +- ^ BroadcastQueryStage (27), Statistics(X) - +- ColumnarBroadcastExchange (26) - +- ^ FilterExecTransformer (24) - +- ^ Scan parquet (23) + +- BroadcastQueryStage (27), Statistics(X) + +- ColumnarBroadcastExchange (26) + +- ^ FilterExecTransformer (24) + +- ^ Scan parquet (23) +- == Initial Plan == - TakeOrderedAndProject (66) - +- HashAggregate (65) - +- Exchange (64) - +- HashAggregate (63) - +- Project (62) - +- BroadcastHashJoin Inner BuildRight (61) - :- Project (57) - : +- BroadcastHashJoin Inner BuildRight (56) - : :- Project (51) - : : +- BroadcastHashJoin Inner BuildRight (50) - : : :- Filter (45) - : : : +- Scan parquet (44) - : : +- BroadcastExchange (49) - : : +- Project (48) - : : +- Filter (47) - : : +- Scan parquet (46) - : +- BroadcastExchange (55) - : +- Project (54) - : +- Filter (53) - : +- Scan parquet (52) - +- BroadcastExchange (60) - +- Filter (59) - +- Scan parquet (58) + TakeOrderedAndProject (67) + +- HashAggregate (66) + +- Exchange (65) + +- HashAggregate (64) + +- Project (63) + +- BroadcastHashJoin Inner BuildRight (62) + :- Project (58) + : +- BroadcastHashJoin Inner BuildRight (57) + : :- Project (52) + : : +- BroadcastHashJoin Inner BuildRight (51) + : : :- Filter (46) + : : : +- Scan parquet (45) + : : +- BroadcastExchange (50) + : : +- Project (49) + : : +- Filter (48) + : : +- Scan parquet (47) + : +- BroadcastExchange (56) + : +- Project (55) + : +- Filter (54) + : +- Scan parquet (53) + +- BroadcastExchange (61) + +- Filter (60) + +- Scan parquet (59) (1) Scan parquet @@ -213,155 +210,159 @@ Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: false -(35) ColumnarExchange +(35) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] +Arguments: X + +(36) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(37) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: X -(37) InputAdapter +(38) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(38) InputIteratorTransformer +(39) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(39) RegularHashAggregateExecTransformer +(40) RegularHashAggregateExecTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), 
DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(40) ProjectExecTransformer +(41) ProjectExecTransformer Output [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Input [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(41) WholeStageCodegenTransformer (X) +(42) WholeStageCodegenTransformer (X) Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: false -(42) TakeOrderedAndProjectExecTransformer +(43) TakeOrderedAndProjectExecTransformer Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X], 0 -(43) VeloxColumnarToRowExec +(44) VeloxColumnarToRowExec Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(44) Scan parquet +(45) Scan parquet Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(45) Filter +(46) Filter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(46) Scan parquet +(47) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(47) Filter +(48) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(48) Project +(49) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(49) BroadcastExchange +(50) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(50) BroadcastHashJoin +(51) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(51) Project +(52) Project Output [8]: [c_custkey#X, c_name#X, c_address#X, 
c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(52) Scan parquet +(53) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(53) Filter +(54) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Condition : ((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(54) Project +(55) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(55) BroadcastExchange +(56) BroadcastExchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(56) BroadcastHashJoin +(57) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(57) Project +(58) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(58) Scan parquet +(59) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(59) Filter +(60) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(60) BroadcastExchange +(61) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(61) BroadcastHashJoin +(62) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(62) Project +(63) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(63) HashAggregate +(64) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(64) Exchange +(65) Exchange Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(65) HashAggregate +(66) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, 
c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(66) TakeOrderedAndProject +(67) TakeOrderedAndProject Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(67) AdaptiveSparkPlan +(68) AdaptiveSparkPlan Output [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/11.txt index 2a860ed5c063a..6dfc1d00f4cc1 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/11.txt @@ -1,59 +1,57 @@ == Physical Plan == -AdaptiveSparkPlan (58) +AdaptiveSparkPlan (60) +- == Final Plan == - VeloxColumnarToRowExec (38) - +- ^ SortExecTransformer (36) - +- ^ InputIteratorTransformer (35) - +- ^ InputAdapter (34) - +- ^ ShuffleQueryStage (33), Statistics(X) - +- ColumnarExchange (32) - +- ^ FilterExecTransformer (30) - +- ^ RegularHashAggregateExecTransformer (29) - +- ^ InputIteratorTransformer (28) - +- ^ InputAdapter (27) - +- ^ ShuffleQueryStage (26), Statistics(X) - +- ColumnarExchange (25) + VeloxColumnarToRowExec (40) + +- ^ SortExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35), Statistics(X) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ FilterExecTransformer (31) + +- ^ RegularHashAggregateExecTransformer (30) + +- ^ InputIteratorTransformer (29) + +- ShuffleQueryStage (27), Statistics(X) + +- ColumnarExchange (26) + +- VeloxAppendBatches (25) +- ^ ProjectExecTransformer (23) +- ^ FlushableHashAggregateExecTransformer (22) +- ^ ProjectExecTransformer (21) - +- ^ BroadcastHashJoinExecTransformer Inner (20) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) :- ^ ProjectExecTransformer (11) - : +- ^ BroadcastHashJoinExecTransformer Inner (10) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (10) : :- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) : +- ^ InputIteratorTransformer (9) - : +- ^ InputAdapter (8) - : +- ^ BroadcastQueryStage (7), Statistics(X) - : +- ColumnarBroadcastExchange (6) - : +- ^ FilterExecTransformer (4) - : +- ^ Scan parquet (3) + : +- BroadcastQueryStage (7), Statistics(X) + : +- ColumnarBroadcastExchange (6) + : +- ^ FilterExecTransformer (4) + : +- ^ Scan parquet (3) +- ^ InputIteratorTransformer (19) - +- ^ InputAdapter (18) - +- ^ BroadcastQueryStage (17), Statistics(X) 
- +- ColumnarBroadcastExchange (16) - +- ^ ProjectExecTransformer (14) - +- ^ FilterExecTransformer (13) - +- ^ Scan parquet (12) + +- BroadcastQueryStage (17), Statistics(X) + +- ColumnarBroadcastExchange (16) + +- ^ ProjectExecTransformer (14) + +- ^ FilterExecTransformer (13) + +- ^ Scan parquet (12) +- == Initial Plan == - Sort (57) - +- Exchange (56) - +- Filter (55) - +- HashAggregate (54) - +- Exchange (53) - +- HashAggregate (52) - +- Project (51) - +- BroadcastHashJoin Inner BuildRight (50) - :- Project (45) - : +- BroadcastHashJoin Inner BuildRight (44) - : :- Filter (40) - : : +- Scan parquet (39) - : +- BroadcastExchange (43) - : +- Filter (42) - : +- Scan parquet (41) - +- BroadcastExchange (49) - +- Project (48) - +- Filter (47) - +- Scan parquet (46) + Sort (59) + +- Exchange (58) + +- Filter (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- Project (53) + +- BroadcastHashJoin Inner BuildRight (52) + :- Project (47) + : +- BroadcastHashJoin Inner BuildRight (46) + : :- Filter (42) + : : +- Scan parquet (41) + : +- BroadcastExchange (45) + : +- Filter (44) + : +- Scan parquet (43) + +- BroadcastExchange (51) + +- Project (50) + +- Filter (49) + +- Scan parquet (48) (1) Scan parquet @@ -162,382 +160,392 @@ Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: false -(25) ColumnarExchange +(25) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] +Arguments: X + +(26) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(26) ShuffleQueryStage +(27) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: X -(27) InputAdapter +(28) InputAdapter Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(28) InputIteratorTransformer +(29) InputIteratorTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(29) RegularHashAggregateExecTransformer +(30) RegularHashAggregateExecTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X AS value#X] -(30) FilterExecTransformer +(31) FilterExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(31) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(32) ColumnarExchange +(33) VeloxAppendBatches +Input [2]: [ps_partkey#X, value#X] +Arguments: X + +(34) ColumnarExchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(33) ShuffleQueryStage +(35) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] Arguments: X -(34) InputAdapter +(36) InputAdapter Input [2]: [ps_partkey#X, value#X] -(35) InputIteratorTransformer +(37) InputIteratorTransformer Input [2]: [ps_partkey#X, value#X] -(36) SortExecTransformer +(38) 
SortExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(37) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(38) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [2]: [ps_partkey#X, value#X] -(39) Scan parquet +(41) Scan parquet Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(40) Filter +(42) Filter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(41) Scan parquet +(43) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(42) Filter +(44) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(43) BroadcastExchange +(45) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(44) BroadcastHashJoin +(46) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(45) Project +(47) Project Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(46) Scan parquet +(48) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(47) Filter +(49) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(48) Project +(50) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(49) BroadcastExchange +(51) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(50) BroadcastHashJoin +(52) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(51) Project +(53) Project Output [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(52) HashAggregate +(54) HashAggregate Input [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(53) Exchange +(55) Exchange Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) HashAggregate +(56) HashAggregate Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), 
DecimalType(23,2)))#X AS value#X] -(55) Filter +(57) Filter Input [2]: [ps_partkey#X, value#X] Condition : (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(56) Exchange +(58) Exchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(57) Sort +(59) Sort Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(58) AdaptiveSparkPlan +(60) AdaptiveSparkPlan Output [2]: [ps_partkey#X, value#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 30 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (99) +Subquery:1 Hosting operator id = 31 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (102) +- == Final Plan == - VeloxColumnarToRowExec (82) - +- ^ ProjectExecTransformer (80) - +- ^ RegularHashAggregateExecTransformer (79) - +- ^ InputIteratorTransformer (78) - +- ^ InputAdapter (77) - +- ^ ShuffleQueryStage (76), Statistics(X) - +- ColumnarExchange (75) - +- ^ FlushableHashAggregateExecTransformer (73) - +- ^ ProjectExecTransformer (72) - +- ^ BroadcastHashJoinExecTransformer Inner (71) - :- ^ ProjectExecTransformer (66) - : +- ^ BroadcastHashJoinExecTransformer Inner (65) - : :- ^ FilterExecTransformer (60) - : : +- ^ Scan parquet (59) - : +- ^ InputIteratorTransformer (64) - : +- ^ InputAdapter (63) - : +- ^ BroadcastQueryStage (62), Statistics(X) - : +- ReusedExchange (61) - +- ^ InputIteratorTransformer (70) - +- ^ InputAdapter (69) - +- ^ BroadcastQueryStage (68), Statistics(X) - +- ReusedExchange (67) + VeloxColumnarToRowExec (85) + +- ^ ProjectExecTransformer (83) + +- ^ RegularHashAggregateExecTransformer (82) + +- ^ InputIteratorTransformer (81) + +- ShuffleQueryStage (79), Statistics(X) + +- ColumnarExchange (78) + +- VeloxAppendBatches (77) + +- ^ FlushableHashAggregateExecTransformer (75) + +- ^ ProjectExecTransformer (74) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (73) + :- ^ ProjectExecTransformer (68) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (67) + : :- ^ FilterExecTransformer (62) + : : +- ^ Scan parquet (61) + : +- ^ InputIteratorTransformer (66) + : +- BroadcastQueryStage (64), Statistics(X) + : +- ReusedExchange (63) + +- ^ InputIteratorTransformer (72) + +- BroadcastQueryStage (70), Statistics(X) + +- ReusedExchange (69) +- == Initial Plan == - HashAggregate (98) - +- Exchange (97) - +- HashAggregate (96) - +- Project (95) - +- BroadcastHashJoin Inner BuildRight (94) - :- Project (89) - : +- BroadcastHashJoin Inner BuildRight (88) - : :- Filter (84) - : : +- Scan parquet (83) - : +- BroadcastExchange (87) - : +- Filter (86) - : +- Scan parquet (85) - +- BroadcastExchange (93) - +- Project (92) - +- Filter (91) - +- Scan parquet (90) - - -(59) Scan parquet + HashAggregate (101) + +- Exchange (100) + +- HashAggregate (99) + +- Project (98) + +- BroadcastHashJoin Inner BuildRight (97) + :- Project (92) + : +- BroadcastHashJoin Inner BuildRight (91) + : :- Filter (87) + : : +- Scan parquet (86) + : +- BroadcastExchange (90) + : +- Filter (89) + : +- Scan parquet (88) + +- BroadcastExchange (96) + +- Project (95) + +- Filter (94) + +- Scan parquet (93) + + +(61) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(60) FilterExecTransformer +(62) FilterExecTransformer Input [3]: [ps_suppkey#X, ps_availqty#X, 
ps_supplycost#X] Arguments: isnotnull(ps_suppkey#X) -(61) ReusedExchange [Reuses operator id: 6] +(63) ReusedExchange [Reuses operator id: 6] Output [2]: [s_suppkey#X, s_nationkey#X] -(62) BroadcastQueryStage +(64) BroadcastQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(63) InputAdapter +(65) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(64) InputIteratorTransformer +(66) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(65) BroadcastHashJoinExecTransformer +(67) BroadcastHashJoinExecTransformer Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(66) ProjectExecTransformer +(68) ProjectExecTransformer Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(67) ReusedExchange [Reuses operator id: 16] +(69) ReusedExchange [Reuses operator id: 16] Output [1]: [n_nationkey#X] -(68) BroadcastQueryStage +(70) BroadcastQueryStage Output [1]: [n_nationkey#X] Arguments: X -(69) InputAdapter +(71) InputAdapter Input [1]: [n_nationkey#X] -(70) InputIteratorTransformer +(72) InputIteratorTransformer Input [1]: [n_nationkey#X] -(71) BroadcastHashJoinExecTransformer +(73) BroadcastHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(72) ProjectExecTransformer +(74) ProjectExecTransformer Output [3]: [ps_availqty#X, ps_supplycost#X, CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)) AS _pre_X#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(73) FlushableHashAggregateExecTransformer +(75) FlushableHashAggregateExecTransformer Input [3]: [ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(74) WholeStageCodegenTransformer (X) +(76) WholeStageCodegenTransformer (X) Input [2]: [sum#X, isEmpty#X] Arguments: false -(75) ColumnarExchange +(77) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(78) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(76) ShuffleQueryStage +(79) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(77) InputAdapter +(80) InputAdapter Input [2]: [sum#X, isEmpty#X] -(78) InputIteratorTransformer +(81) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(79) RegularHashAggregateExecTransformer +(82) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] -(80) ProjectExecTransformer +(83) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(cast(sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X as decimal(38,10))) * 0.0001000000), DecimalType(38,6)) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Input [1]: 
[sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] -(81) WholeStageCodegenTransformer (X) +(84) WholeStageCodegenTransformer (X) Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: false -(82) VeloxColumnarToRowExec +(85) VeloxColumnarToRowExec Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(83) Scan parquet +(86) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(84) Filter +(87) Filter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(85) Scan parquet +(88) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(86) Filter +(89) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(87) BroadcastExchange +(90) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(88) BroadcastHashJoin +(91) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(89) Project +(92) Project Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(90) Scan parquet +(93) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(91) Filter +(94) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(92) Project +(95) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(93) BroadcastExchange +(96) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(94) BroadcastHashJoin +(97) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(95) Project +(98) Project Output [2]: [ps_availqty#X, ps_supplycost#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(96) HashAggregate +(99) HashAggregate Input [2]: [ps_availqty#X, ps_supplycost#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(97) Exchange +(100) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(98) HashAggregate +(101) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [1]: [CheckOverflow((promote_precision(cast(sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X as decimal(38,10))) * 0.0001000000), 
DecimalType(38,6)) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(99) AdaptiveSparkPlan +(102) AdaptiveSparkPlan Output [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/12.txt index 99576f6e063a2..1e1ad6d497a4a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/12.txt @@ -1,44 +1,43 @@ == Physical Plan == -AdaptiveSparkPlan (42) +AdaptiveSparkPlan (44) +- == Final Plan == - VeloxColumnarToRowExec (28) - +- ^ SortExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ InputIteratorTransformer (19) - +- ^ InputAdapter (18) - +- ^ ShuffleQueryStage (17), Statistics(X) - +- ColumnarExchange (16) + VeloxColumnarToRowExec (30) + +- ^ SortExecTransformer (28) + +- ^ InputIteratorTransformer (27) + +- ShuffleQueryStage (25), Statistics(X) + +- ColumnarExchange (24) + +- VeloxAppendBatches (23) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ InputIteratorTransformer (20) + +- ShuffleQueryStage (18), Statistics(X) + +- ColumnarExchange (17) + +- VeloxAppendBatches (16) +- ^ ProjectExecTransformer (14) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ BroadcastQueryStage (5), Statistics(X) - : +- ColumnarBroadcastExchange (4) - : +- ^ FilterExecTransformer (2) - : +- ^ Scan parquet (1) + : +- BroadcastQueryStage (5), Statistics(X) + : +- ColumnarBroadcastExchange (4) + : +- ^ FilterExecTransformer (2) + : +- ^ Scan parquet (1) +- ^ ProjectExecTransformer (10) +- ^ FilterExecTransformer (9) +- ^ Scan parquet (8) +- == Initial Plan == - Sort (41) - +- Exchange (40) - +- HashAggregate (39) - +- Exchange (38) - +- HashAggregate (37) - +- Project (36) - +- BroadcastHashJoin Inner BuildLeft (35) - :- BroadcastExchange (31) - : +- Filter (30) - : +- Scan parquet (29) - +- Project (34) - +- Filter (33) - +- Scan parquet (32) + Sort (43) + +- Exchange (42) + +- HashAggregate (41) + +- Exchange (40) + +- HashAggregate (39) + +- Project (38) + +- BroadcastHashJoin Inner BuildLeft (37) + :- BroadcastExchange (33) + : +- Filter (32) + : +- Scan parquet (31) + +- Project (36) + +- Filter (35) + +- Scan parquet (34) (1) Scan parquet @@ -109,121 +108,129 @@ Input [3]: [l_shipmode#X, sum#X, sum#X] Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: false -(16) ColumnarExchange +(16) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] +Arguments: X + +(17) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] -(17) ShuffleQueryStage +(18) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] Arguments: X -(18) InputAdapter +(19) InputAdapter Input [3]: [l_shipmode#X, sum#X, sum#X] -(19) InputIteratorTransformer +(20) InputIteratorTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] -(20) 
RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(22) ColumnarExchange +(23) VeloxAppendBatches +Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] +Arguments: X + +(24) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(25) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: X -(24) InputAdapter +(26) InputAdapter Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(25) InputIteratorTransformer +(27) InputIteratorTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(26) SortExecTransformer +(28) SortExecTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(27) WholeStageCodegenTransformer (X) +(29) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(28) VeloxColumnarToRowExec +(30) VeloxColumnarToRowExec Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(29) Scan parquet +(31) Scan parquet Output [2]: [o_orderkey#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(30) Filter +(32) Filter Input [2]: [o_orderkey#X, o_orderpriority#X] Condition : isnotnull(o_orderkey#X) -(31) BroadcastExchange +(33) BroadcastExchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(32) Scan parquet +(34) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(33) Filter +(35) Filter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Condition : ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(34) Project +(36) Project Output [2]: [l_orderkey#X, l_shipmode#X] 
Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(35) BroadcastHashJoin +(37) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(36) Project +(38) Project Output [2]: [o_orderpriority#X, l_shipmode#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(37) HashAggregate +(39) HashAggregate Input [2]: [o_orderpriority#X, l_shipmode#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), partial_sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(38) Exchange +(40) Exchange Input [3]: [l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) HashAggregate +(41) HashAggregate Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(40) Exchange +(42) Exchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Sort +(43) Sort Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(42) AdaptiveSparkPlan +(44) AdaptiveSparkPlan Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/13.txt index 8ebeb7d32d52e..ff6532aa65791 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/13.txt @@ -1,53 +1,52 @@ == Physical Plan == -AdaptiveSparkPlan (52) +AdaptiveSparkPlan (55) +- == Final Plan == - VeloxColumnarToRowExec (36) - +- ^ SortExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31), Statistics(X) - +- ColumnarExchange (30) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (39) + +- ^ SortExecTransformer (37) + +- ^ InputIteratorTransformer (36) + +- 
ShuffleQueryStage (34), Statistics(X) + +- ColumnarExchange (33) + +- VeloxAppendBatches (32) + +- ^ RegularHashAggregateExecTransformer (30) + +- ^ InputIteratorTransformer (29) + +- ShuffleQueryStage (27), Statistics(X) + +- ColumnarExchange (26) + +- VeloxAppendBatches (25) + +- ^ ProjectExecTransformer (23) + +- ^ FlushableHashAggregateExecTransformer (22) + +- ^ ProjectExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ ProjectExecTransformer (13) +- ^ FlushableHashAggregateExecTransformer (12) +- ^ ProjectExecTransformer (11) - +- ^ BroadcastHashJoinExecTransformer LeftOuter (10) + +- ^ BroadcastHashJoinExecTransformer LeftOuter BuildRight (10) :- ^ Scan parquet (1) +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ BroadcastQueryStage (7), Statistics(X) - +- ColumnarBroadcastExchange (6) - +- ^ ProjectExecTransformer (4) - +- ^ FilterExecTransformer (3) - +- ^ Scan parquet (2) + +- BroadcastQueryStage (7), Statistics(X) + +- ColumnarBroadcastExchange (6) + +- ^ ProjectExecTransformer (4) + +- ^ FilterExecTransformer (3) + +- ^ Scan parquet (2) +- == Initial Plan == - Sort (51) - +- Exchange (50) - +- HashAggregate (49) - +- Exchange (48) - +- HashAggregate (47) - +- HashAggregate (46) - +- Exchange (45) - +- HashAggregate (44) - +- Project (43) - +- BroadcastHashJoin LeftOuter BuildRight (42) - :- Scan parquet (37) - +- BroadcastExchange (41) - +- Project (40) - +- Filter (39) - +- Scan parquet (38) + Sort (54) + +- Exchange (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- HashAggregate (49) + +- Exchange (48) + +- HashAggregate (47) + +- Project (46) + +- BroadcastHashJoin LeftOuter BuildRight (45) + :- Scan parquet (40) + +- BroadcastExchange (44) + +- Project (43) + +- Filter (42) + +- Scan parquet (41) (1) Scan parquet @@ -113,174 +112,186 @@ Input [2]: [c_custkey#X, count#X] Input [3]: [hash_partition_key#X, c_custkey#X, count#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, count#X] +Arguments: X + +(16) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, count#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [2]: [c_custkey#X, count#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [2]: [c_custkey#X, count#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [2]: [c_custkey#X, count#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [2]: [c_custkey#X, count(o_orderkey#X)#X] -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [count(o_orderkey#X)#X AS c_count#X] Input [2]: [c_custkey#X, count(o_orderkey#X)#X] -(21) FlushableHashAggregateExecTransformer +(22) FlushableHashAggregateExecTransformer Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(22) ProjectExecTransformer +(23) ProjectExecTransformer Output [3]: [hash(c_count#X, 42) AS hash_partition_key#X, c_count#X, count#X] Input [2]: [c_count#X, count#X] -(23) WholeStageCodegenTransformer (X) +(24) 
WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: false -(24) ColumnarExchange +(25) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_count#X, count#X] +Arguments: X + +(26) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(27) ShuffleQueryStage Output [2]: [c_count#X, count#X] Arguments: X -(26) InputAdapter +(28) InputAdapter Input [2]: [c_count#X, count#X] -(27) InputIteratorTransformer +(29) InputIteratorTransformer Input [2]: [c_count#X, count#X] -(28) RegularHashAggregateExecTransformer +(30) RegularHashAggregateExecTransformer Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(29) WholeStageCodegenTransformer (X) +(31) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(30) ColumnarExchange +(32) VeloxAppendBatches +Input [2]: [c_count#X, custdist#X] +Arguments: X + +(33) ColumnarExchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(34) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] Arguments: X -(32) InputAdapter +(35) InputAdapter Input [2]: [c_count#X, custdist#X] -(33) InputIteratorTransformer +(36) InputIteratorTransformer Input [2]: [c_count#X, custdist#X] -(34) SortExecTransformer +(37) SortExecTransformer Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(35) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(36) VeloxColumnarToRowExec +(39) VeloxColumnarToRowExec Input [2]: [c_count#X, custdist#X] -(37) Scan parquet +(40) Scan parquet Output [1]: [c_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Scan parquet +(41) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(39) Filter +(42) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Condition : ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(40) Project +(43) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(41) BroadcastExchange +(44) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(42) BroadcastHashJoin +(45) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(43) Project +(46) Project Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(44) HashAggregate +(47) HashAggregate Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(45) Exchange +(48) Exchange Input [2]: [c_custkey#X, count#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(46) HashAggregate +(49) HashAggregate Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: 
[count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [1]: [count(o_orderkey#X)#X AS c_count#X] -(47) HashAggregate +(50) HashAggregate Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(48) Exchange +(51) Exchange Input [2]: [c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) HashAggregate +(52) HashAggregate Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(50) Exchange +(53) Exchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Sort +(54) Sort Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(52) AdaptiveSparkPlan +(55) AdaptiveSparkPlan Output [2]: [c_count#X, custdist#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/14.txt index bfba707a57920..d144dd39ca1d0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/14.txt @@ -1,37 +1,36 @@ == Physical Plan == -AdaptiveSparkPlan (34) +AdaptiveSparkPlan (35) +- == Final Plan == - VeloxColumnarToRowExec (22) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (23) + +- ^ ProjectExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) :- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ BroadcastQueryStage (8), Statistics(X) - +- ColumnarBroadcastExchange (7) - +- ^ FilterExecTransformer (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (8), Statistics(X) + +- ColumnarBroadcastExchange (7) + +- ^ FilterExecTransformer (5) + +- ^ Scan parquet (4) +- == Initial Plan == - HashAggregate (33) - +- Exchange (32) - +- HashAggregate (31) - +- Project (30) - +- BroadcastHashJoin Inner BuildRight (29) - :- Project (25) - : +- Filter (24) - : +- Scan parquet (23) - +- BroadcastExchange (28) - +- Filter (27) - +- Scan parquet (26) + HashAggregate (34) + +- Exchange (33) + +- HashAggregate (32) + +- Project (31) + +- BroadcastHashJoin Inner BuildRight (30) + :- Project (26) + : +- Filter (25) + : +- Scan parquet (24) + +- BroadcastExchange (29) + +- Filter (28) + +- Scan parquet (27) (1) Scan parquet @@ -98,95 +97,99 @@ Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(16) ColumnarExchange Input [4]: [sum#X, 
isEmpty#X, sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X)), DecimalType(38,6))) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X as decimal(38,6)))), DecimalType(38,6)) AS promo_revenue#X] Input [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [1]: [promo_revenue#X] Arguments: false -(22) VeloxColumnarToRowExec +(23) 
VeloxColumnarToRowExec Input [1]: [promo_revenue#X] -(23) Scan parquet +(24) Scan parquet Output [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-09-01), LessThan(l_shipdate,1995-10-01), IsNotNull(l_partkey)] ReadSchema: struct -(24) Filter +(25) Filter Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-09-01)) AND (l_shipdate#X < 1995-10-01)) AND isnotnull(l_partkey#X)) -(25) Project +(26) Project Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(26) Scan parquet +(27) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(27) Filter +(28) Filter Input [2]: [p_partkey#X, p_type#X] Condition : isnotnull(p_partkey#X) -(28) BroadcastExchange +(29) BroadcastExchange Input [2]: [p_partkey#X, p_type#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(29) BroadcastHashJoin +(30) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(30) Project +(31) Project Output [3]: [l_extendedprice#X, l_discount#X, p_type#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(31) HashAggregate +(32) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, p_type#X] Keys: [] Functions [2]: [partial_sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(32) Exchange +(33) Exchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(33) HashAggregate +(34) HashAggregate Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: 
[CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X)), DecimalType(38,6))) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X as decimal(38,6)))), DecimalType(38,6)) AS promo_revenue#X] -(34) AdaptiveSparkPlan +(35) AdaptiveSparkPlan Output [1]: [promo_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/15.txt index 06b833361548b..c4f825f5ca533 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/15.txt @@ -1,44 +1,44 @@ == Physical Plan == -AdaptiveSparkPlan (41) +AdaptiveSparkPlan (43) +- == Final Plan == - VeloxColumnarToRowExec (26) - +- AQEShuffleRead (25) - +- ShuffleQueryStage (24), Statistics(X) - +- ColumnarExchange (23) - +- ^ ProjectExecTransformer (21) - +- ^ BroadcastHashJoinExecTransformer Inner (20) - :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ BroadcastQueryStage (5), Statistics(X) - : +- ColumnarBroadcastExchange (4) - : +- ^ FilterExecTransformer (2) - : +- ^ Scan parquet (1) - +- ^ FilterExecTransformer (19) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ InputIteratorTransformer (17) - +- ^ InputAdapter (16) - +- ^ ShuffleQueryStage (15), Statistics(X) - +- ColumnarExchange (14) - +- ^ ProjectExecTransformer (12) - +- ^ FlushableHashAggregateExecTransformer (11) - +- ^ ProjectExecTransformer (10) - +- ^ FilterExecTransformer (9) - +- ^ Scan parquet (8) + VeloxColumnarToRowExec (28) + +- AQEShuffleRead (27) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (21) + :- ^ InputIteratorTransformer (7) + : +- BroadcastQueryStage (5), Statistics(X) + : +- ColumnarBroadcastExchange (4) + : +- ^ FilterExecTransformer (2) + : +- ^ Scan parquet (1) + +- ^ FilterExecTransformer (20) + +- ^ RegularHashAggregateExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FlushableHashAggregateExecTransformer (11) + +- ^ ProjectExecTransformer (10) + +- ^ FilterExecTransformer (9) + +- ^ Scan parquet (8) +- == Initial Plan == - Sort (40) - +- Exchange (39) - +- Project (38) - +- BroadcastHashJoin Inner BuildLeft (37) - :- BroadcastExchange (29) - : +- Filter (28) - : +- Scan parquet (27) - +- Filter (36) - +- HashAggregate (35) - +- Exchange (34) - +- HashAggregate (33) - +- Project (32) - +- Filter (31) - +- Scan parquet (30) + Sort (42) + +- Exchange (41) + +- Project (40) + +- BroadcastHashJoin Inner BuildLeft (39) + :- BroadcastExchange (31) + : +- Filter (30) + : +- Scan parquet (29) + +- Filter (38) + +- HashAggregate (37) + +- Exchange (36) + +- HashAggregate (35) + +- Project (34) + +- Filter (33) + +- 
Scan parquet (32) (1) Scan parquet @@ -100,285 +100,297 @@ Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] -(19) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [supplier_no#X, total_revenue#X] Arguments: (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(20) BroadcastHashJoinExecTransformer +(21) BroadcastHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(21) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: X -(25) AQEShuffleRead +(27) AQEShuffleRead Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: local -(26) VeloxColumnarToRowExec +(28) VeloxColumnarToRowExec Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(27) Scan parquet +(29) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey)] ReadSchema: struct -(28) Filter +(30) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Condition : 
isnotnull(s_suppkey#X) -(29) BroadcastExchange +(31) BroadcastExchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(30) Scan parquet +(32) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(31) Filter +(33) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(32) Project +(34) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(33) HashAggregate +(35) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(34) Exchange +(36) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(35) HashAggregate +(37) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] -(36) Filter +(38) Filter Input [2]: [supplier_no#X, total_revenue#X] Condition : (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(37) BroadcastHashJoin +(39) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(38) Project +(40) Project Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(39) Exchange +(41) Exchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(40) Sort +(42) Sort Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(41) AdaptiveSparkPlan +(43) AdaptiveSparkPlan Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 19 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (66) +Subquery:1 
Hosting operator id = 20 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (69) +- == Final Plan == - VeloxColumnarToRowExec (57) - +- ^ RegularHashAggregateExecTransformer (55) - +- ^ RegularHashAggregateExecTransformer (54) - +- ^ ProjectExecTransformer (53) - +- ^ RegularHashAggregateExecTransformer (52) - +- ^ InputIteratorTransformer (51) - +- ^ InputAdapter (50) - +- ^ ShuffleQueryStage (49), Statistics(X) - +- ColumnarExchange (48) - +- ^ ProjectExecTransformer (46) - +- ^ FlushableHashAggregateExecTransformer (45) - +- ^ ProjectExecTransformer (44) - +- ^ FilterExecTransformer (43) - +- ^ Scan parquet (42) + VeloxColumnarToRowExec (60) + +- ^ RegularHashAggregateExecTransformer (58) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ ProjectExecTransformer (56) + +- ^ RegularHashAggregateExecTransformer (55) + +- ^ InputIteratorTransformer (54) + +- ShuffleQueryStage (52), Statistics(X) + +- ColumnarExchange (51) + +- VeloxAppendBatches (50) + +- ^ ProjectExecTransformer (48) + +- ^ FlushableHashAggregateExecTransformer (47) + +- ^ ProjectExecTransformer (46) + +- ^ FilterExecTransformer (45) + +- ^ Scan parquet (44) +- == Initial Plan == - HashAggregate (65) - +- HashAggregate (64) - +- HashAggregate (63) - +- Exchange (62) - +- HashAggregate (61) - +- Project (60) - +- Filter (59) - +- Scan parquet (58) + HashAggregate (68) + +- HashAggregate (67) + +- HashAggregate (66) + +- Exchange (65) + +- HashAggregate (64) + +- Project (63) + +- Filter (62) + +- Scan parquet (61) -(42) Scan parquet +(44) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(43) FilterExecTransformer +(45) FilterExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: ((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) -(44) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(45) FlushableHashAggregateExecTransformer +(47) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(46) ProjectExecTransformer +(48) ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(47) WholeStageCodegenTransformer (X) +(49) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(48) ColumnarExchange +(50) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(51) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(49) ShuffleQueryStage +(52) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, 
isEmpty#X] Arguments: X -(50) InputAdapter +(53) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(51) InputIteratorTransformer +(54) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(52) RegularHashAggregateExecTransformer +(55) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [l_suppkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(53) ProjectExecTransformer +(56) ProjectExecTransformer Output [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] Input [2]: [l_suppkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(54) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(55) RegularHashAggregateExecTransformer +(58) RegularHashAggregateExecTransformer Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(56) WholeStageCodegenTransformer (X) +(59) WholeStageCodegenTransformer (X) Input [1]: [max(total_revenue)#X] Arguments: false -(57) VeloxColumnarToRowExec +(60) VeloxColumnarToRowExec Input [1]: [max(total_revenue)#X] -(58) Scan parquet +(61) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(59) Filter +(62) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) -(60) Project +(63) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(61) HashAggregate +(64) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(62) Exchange +(65) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] 
Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(63) HashAggregate +(66) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] -(64) HashAggregate +(67) HashAggregate Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(65) HashAggregate +(68) HashAggregate Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(66) AdaptiveSparkPlan +(69) AdaptiveSparkPlan Output [1]: [max(total_revenue)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/16.txt index af623a3a2f683..741b837976a42 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/16.txt @@ -1,57 +1,56 @@ == Physical Plan == -AdaptiveSparkPlan (56) +AdaptiveSparkPlan (59) +- == Final Plan == - VeloxColumnarToRowExec (35) - +- ^ SortExecTransformer (33) - +- ^ InputIteratorTransformer (32) - +- ^ InputAdapter (31) - +- ^ ShuffleQueryStage (30), Statistics(X) - +- ColumnarExchange (29) - +- ^ RegularHashAggregateExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24), Statistics(X) - +- ColumnarExchange (23) - +- ^ ProjectExecTransformer (21) - +- ^ FlushableHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ ProjectExecTransformer (13) +- ^ FlushableHashAggregateExecTransformer (12) +- ^ ProjectExecTransformer (11) - +- ^ BroadcastHashJoinExecTransformer Inner (10) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (10) :- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (9) - 
+- ^ InputAdapter (8) - +- ^ BroadcastQueryStage (7), Statistics(X) - +- ColumnarBroadcastExchange (6) - +- ^ FilterExecTransformer (4) - +- ^ Scan parquet (3) + +- BroadcastQueryStage (7), Statistics(X) + +- ColumnarBroadcastExchange (6) + +- ^ FilterExecTransformer (4) + +- ^ Scan parquet (3) +- == Initial Plan == - Sort (55) - +- Exchange (54) - +- HashAggregate (53) - +- Exchange (52) - +- HashAggregate (51) - +- HashAggregate (50) - +- Exchange (49) - +- HashAggregate (48) - +- Project (47) - +- BroadcastHashJoin Inner BuildRight (46) - :- BroadcastHashJoin LeftAnti BuildRight (42) - : :- Filter (37) - : : +- Scan parquet (36) - : +- BroadcastExchange (41) - : +- Project (40) - : +- Filter (39) - : +- Scan parquet (38) - +- BroadcastExchange (45) - +- Filter (44) - +- Scan parquet (43) + Sort (58) + +- Exchange (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- HashAggregate (53) + +- Exchange (52) + +- HashAggregate (51) + +- Project (50) + +- BroadcastHashJoin Inner BuildRight (49) + :- BroadcastHashJoin LeftAnti BuildRight (45) + : :- Filter (40) + : : +- Scan parquet (39) + : +- BroadcastExchange (44) + : +- Project (43) + : +- Filter (42) + : +- Scan parquet (41) + +- BroadcastExchange (48) + +- Filter (47) + +- Scan parquet (46) (1) Scan parquet @@ -118,195 +117,207 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] +Arguments: X + +(16) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(20) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(21) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, 
p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: X -(25) InputAdapter +(27) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(27) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(28) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(29) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(30) ShuffleQueryStage +(33) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: X -(31) InputAdapter +(34) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(32) InputIteratorTransformer +(35) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(33) SortExecTransformer +(36) SortExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(34) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(35) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(36) Scan parquet +(39) Scan parquet Output [2]: [ps_partkey#X, ps_suppkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_partkey)] ReadSchema: struct -(37) Filter +(40) Filter Input [2]: [ps_partkey#X, ps_suppkey#X] Condition : isnotnull(ps_partkey#X) -(38) Scan parquet +(41) Scan parquet Output [2]: [s_suppkey#X, s_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_comment)] ReadSchema: struct -(39) Filter +(42) Filter Input [2]: [s_suppkey#X, s_comment#X] Condition : (isnotnull(s_comment#X) AND s_comment#X LIKE %Customer%Complaints%) -(40) Project +(43) Project Output [1]: [s_suppkey#X] Input [2]: [s_suppkey#X, s_comment#X] -(41) BroadcastExchange +(44) BroadcastExchange Input [1]: [s_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),true), [plan_id=X] -(42) BroadcastHashJoin +(45) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(43) Scan parquet +(46) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(44) Filter +(47) Filter Input [4]: 
[p_partkey#X, p_brand#X, p_type#X, p_size#X] Condition : (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(45) BroadcastExchange +(48) BroadcastExchange Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(46) BroadcastHashJoin +(49) BroadcastHashJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(47) Project +(50) Project Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(48) HashAggregate +(51) HashAggregate Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(49) Exchange +(52) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(50) HashAggregate +(53) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(51) HashAggregate +(54) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(52) Exchange +(55) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(53) HashAggregate +(56) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(54) Exchange +(57) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(55) Sort +(58) Sort Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(56) AdaptiveSparkPlan +(59) AdaptiveSparkPlan Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/17.txt index 712560e7f577c..1e2ed970aef4f 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/17.txt @@ -1,39 +1,38 @@ == Physical Plan == -AdaptiveSparkPlan (35) +AdaptiveSparkPlan (36) +- == Final Plan == - VeloxColumnarToRowExec (14) - +- ^ ProjectExecTransformer (12) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage 
(8), Statistics(X) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (15) + +- ^ ProjectExecTransformer (13) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9), Statistics(X) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ FlushableHashAggregateExecTransformer (5) +- ^ InputIteratorTransformer (4) - +- ^ InputAdapter (3) - +- ^ RowToVeloxColumnar (2) - +- ^ LocalTableScan (1) + +- RowToVeloxColumnar (2) + +- LocalTableScan (1) +- == Initial Plan == - HashAggregate (34) - +- Exchange (33) - +- HashAggregate (32) - +- Project (31) - +- BroadcastHashJoin Inner BuildRight (30) - :- Project (22) - : +- BroadcastHashJoin Inner BuildRight (21) - : :- Filter (16) - : : +- Scan parquet (15) - : +- BroadcastExchange (20) - : +- Project (19) - : +- Filter (18) - : +- Scan parquet (17) - +- BroadcastExchange (29) - +- Filter (28) - +- HashAggregate (27) - +- Exchange (26) - +- HashAggregate (25) - +- Filter (24) - +- Scan parquet (23) + HashAggregate (35) + +- Exchange (34) + +- HashAggregate (33) + +- Project (32) + +- BroadcastHashJoin Inner BuildRight (31) + :- Project (23) + : +- BroadcastHashJoin Inner BuildRight (22) + : :- Filter (17) + : : +- Scan parquet (16) + : +- BroadcastExchange (21) + : +- Project (20) + : +- Filter (19) + : +- Scan parquet (18) + +- BroadcastExchange (30) + +- Filter (29) + +- HashAggregate (28) + +- Exchange (27) + +- HashAggregate (26) + +- Filter (25) + +- Scan parquet (24) (1) LocalTableScan @@ -60,141 +59,145 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(8) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [2]: [sum#X, isEmpty#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [sum(l_extendedprice#X)#X] -(12) ProjectExecTransformer +(13) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6)) AS avg_yearly#X] Input [1]: [sum(l_extendedprice#X)#X] -(13) WholeStageCodegenTransformer (X) +(14) WholeStageCodegenTransformer (X) Input [1]: [avg_yearly#X] Arguments: false -(14) VeloxColumnarToRowExec +(15) VeloxColumnarToRowExec Input [1]: [avg_yearly#X] -(15) Scan parquet +(16) Scan parquet Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_quantity)] ReadSchema: struct -(16) Filter +(17) Filter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Condition : (isnotnull(l_partkey#X) AND isnotnull(l_quantity#X)) -(17) Scan parquet +(18) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(18) Filter +(19) Filter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Condition : ((((isnotnull(p_brand#X) AND 
isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(19) Project +(20) Project Output [1]: [p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(20) BroadcastExchange +(21) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(21) BroadcastHashJoin +(22) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(22) Project +(23) Project Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(23) Scan parquet +(24) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(24) Filter +(25) Filter Input [2]: [l_partkey#X, l_quantity#X] Condition : isnotnull(l_partkey#X) -(25) HashAggregate +(26) HashAggregate Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(26) Exchange +(27) Exchange Input [3]: [l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) HashAggregate +(28) HashAggregate Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [CheckOverflow((0.200000 * promote_precision(avg(l_quantity#X)#X)), DecimalType(18,7)) AS (0.2 * avg(l_quantity))#X, l_partkey#X] -(28) Filter +(29) Filter Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Condition : isnotnull((0.2 * avg(l_quantity))#X) -(29) BroadcastExchange +(30) BroadcastExchange Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(30) BroadcastHashJoin +(31) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(31) Project +(32) Project Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(32) HashAggregate +(33) HashAggregate Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(33) Exchange +(34) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(34) HashAggregate +(35) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6)) AS avg_yearly#X] -(35) AdaptiveSparkPlan +(36) AdaptiveSparkPlan Output [1]: [avg_yearly#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/18.txt index cc13e5f3b886c..05884fea78851 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/18.txt @@ -1,86 +1,82 @@ == Physical Plan == -AdaptiveSparkPlan (86) +AdaptiveSparkPlan (88) +- == Final Plan == - 
VeloxColumnarToRowExec (53) - +- TakeOrderedAndProjectExecTransformer (52) - +- ^ RegularHashAggregateExecTransformer (50) - +- ^ InputIteratorTransformer (49) - +- ^ InputAdapter (48) - +- ^ ShuffleQueryStage (47), Statistics(X) - +- ColumnarExchange (46) - +- ^ ProjectExecTransformer (44) - +- ^ FlushableHashAggregateExecTransformer (43) - +- ^ ProjectExecTransformer (42) - +- ^ BroadcastHashJoinExecTransformer Inner (41) - :- ^ ProjectExecTransformer (28) - : +- ^ BroadcastHashJoinExecTransformer Inner (27) + VeloxColumnarToRowExec (55) + +- TakeOrderedAndProjectExecTransformer (54) + +- ^ RegularHashAggregateExecTransformer (52) + +- ^ InputIteratorTransformer (51) + +- ShuffleQueryStage (49), Statistics(X) + +- ColumnarExchange (48) + +- VeloxAppendBatches (47) + +- ^ ProjectExecTransformer (45) + +- ^ FlushableHashAggregateExecTransformer (44) + +- ^ ProjectExecTransformer (43) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (42) + :- ^ ProjectExecTransformer (29) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (28) : :- ^ InputIteratorTransformer (7) - : : +- ^ InputAdapter (6) - : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : +- ColumnarBroadcastExchange (4) - : : +- ^ FilterExecTransformer (2) - : : +- ^ Scan parquet (1) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (26) + : : +- BroadcastQueryStage (5), Statistics(X) + : : +- ColumnarBroadcastExchange (4) + : : +- ^ FilterExecTransformer (2) + : : +- ^ Scan parquet (1) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (27) : :- ^ FilterExecTransformer (9) : : +- ^ Scan parquet (8) - : +- ^ InputIteratorTransformer (25) - : +- ^ InputAdapter (24) - : +- ^ BroadcastQueryStage (23), Statistics(X) - : +- ColumnarBroadcastExchange (22) - : +- ^ ProjectExecTransformer (20) - : +- ^ FilterExecTransformer (19) - : +- ^ RegularHashAggregateExecTransformer (18) - : +- ^ InputIteratorTransformer (17) - : +- ^ InputAdapter (16) - : +- ^ ShuffleQueryStage (15), Statistics(X) - : +- ColumnarExchange (14) - : +- ^ ProjectExecTransformer (12) - : +- ^ FlushableHashAggregateExecTransformer (11) - : +- ^ Scan parquet (10) - +- ^ InputIteratorTransformer (40) - +- ^ InputAdapter (39) - +- ^ BroadcastQueryStage (38), Statistics(X) - +- ColumnarBroadcastExchange (37) - +- ^ BroadcastHashJoinExecTransformer LeftSemi (35) - :- ^ FilterExecTransformer (30) - : +- ^ Scan parquet (29) - +- ^ InputIteratorTransformer (34) - +- ^ InputAdapter (33) - +- ^ BroadcastQueryStage (32), Statistics(X) - +- ReusedExchange (31) + : +- ^ InputIteratorTransformer (26) + : +- BroadcastQueryStage (24), Statistics(X) + : +- ColumnarBroadcastExchange (23) + : +- ^ ProjectExecTransformer (21) + : +- ^ FilterExecTransformer (20) + : +- ^ RegularHashAggregateExecTransformer (19) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FlushableHashAggregateExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (41) + +- BroadcastQueryStage (39), Statistics(X) + +- ColumnarBroadcastExchange (38) + +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (36) + :- ^ FilterExecTransformer (31) + : +- ^ Scan parquet (30) + +- ^ InputIteratorTransformer (35) + +- BroadcastQueryStage (33), Statistics(X) + +- ReusedExchange (32) +- == Initial Plan == - TakeOrderedAndProject (85) - +- HashAggregate (84) - +- Exchange (83) - +- HashAggregate (82) - +- Project (81) - +- BroadcastHashJoin 
Inner BuildRight (80) - :- Project (68) - : +- BroadcastHashJoin Inner BuildLeft (67) - : :- BroadcastExchange (56) - : : +- Filter (55) - : : +- Scan parquet (54) - : +- BroadcastHashJoin LeftSemi BuildRight (66) - : :- Filter (58) - : : +- Scan parquet (57) - : +- BroadcastExchange (65) - : +- Project (64) - : +- Filter (63) - : +- HashAggregate (62) - : +- Exchange (61) - : +- HashAggregate (60) - : +- Scan parquet (59) - +- BroadcastExchange (79) - +- BroadcastHashJoin LeftSemi BuildRight (78) - :- Filter (70) - : +- Scan parquet (69) - +- BroadcastExchange (77) - +- Project (76) - +- Filter (75) - +- HashAggregate (74) - +- Exchange (73) - +- HashAggregate (72) - +- Scan parquet (71) + TakeOrderedAndProject (87) + +- HashAggregate (86) + +- Exchange (85) + +- HashAggregate (84) + +- Project (83) + +- BroadcastHashJoin Inner BuildRight (82) + :- Project (70) + : +- BroadcastHashJoin Inner BuildLeft (69) + : :- BroadcastExchange (58) + : : +- Filter (57) + : : +- Scan parquet (56) + : +- BroadcastHashJoin LeftSemi BuildRight (68) + : :- Filter (60) + : : +- Scan parquet (59) + : +- BroadcastExchange (67) + : +- Project (66) + : +- Filter (65) + : +- HashAggregate (64) + : +- Exchange (63) + : +- HashAggregate (62) + : +- Scan parquet (61) + +- BroadcastExchange (81) + +- BroadcastHashJoin LeftSemi BuildRight (80) + :- Filter (72) + : +- Scan parquet (71) + +- BroadcastExchange (79) + +- Project (78) + +- Filter (77) + +- HashAggregate (76) + +- Exchange (75) + +- HashAggregate (74) + +- Scan parquet (73) (1) Scan parquet @@ -144,333 +140,341 @@ Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(19) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [1]: [l_orderkey#X] Arguments: false -(22) ColumnarBroadcastExchange +(23) ColumnarBroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(23) BroadcastQueryStage +(24) BroadcastQueryStage Output [1]: [l_orderkey#X] Arguments: X -(24) InputAdapter +(25) InputAdapter Input [1]: [l_orderkey#X] -(25) InputIteratorTransformer +(26) InputIteratorTransformer Input [1]: [l_orderkey#X] -(26) BroadcastHashJoinExecTransformer +(27) 
BroadcastHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(27) BroadcastHashJoinExecTransformer +(28) BroadcastHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(28) ProjectExecTransformer +(29) ProjectExecTransformer Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(29) Scan parquet +(30) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(30) FilterExecTransformer +(31) FilterExecTransformer Input [2]: [l_orderkey#X, l_quantity#X] Arguments: isnotnull(l_orderkey#X) -(31) ReusedExchange [Reuses operator id: 22] +(32) ReusedExchange [Reuses operator id: 23] Output [1]: [l_orderkey#X] -(32) BroadcastQueryStage +(33) BroadcastQueryStage Output [1]: [l_orderkey#X] Arguments: X -(33) InputAdapter +(34) InputAdapter Input [1]: [l_orderkey#X] -(34) InputIteratorTransformer +(35) InputIteratorTransformer Input [1]: [l_orderkey#X] -(35) BroadcastHashJoinExecTransformer +(36) BroadcastHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(36) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [2]: [l_orderkey#X, l_quantity#X] Arguments: false -(37) ColumnarBroadcastExchange +(38) ColumnarBroadcastExchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(38) BroadcastQueryStage +(39) BroadcastQueryStage Output [2]: [l_orderkey#X, l_quantity#X] Arguments: X -(39) InputAdapter +(40) InputAdapter Input [2]: [l_orderkey#X, l_quantity#X] -(40) InputIteratorTransformer +(41) InputIteratorTransformer Input [2]: [l_orderkey#X, l_quantity#X] -(41) BroadcastHashJoinExecTransformer +(42) BroadcastHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(42) ProjectExecTransformer +(43) ProjectExecTransformer Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(43) FlushableHashAggregateExecTransformer +(44) FlushableHashAggregateExecTransformer Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(44) ProjectExecTransformer +(45) ProjectExecTransformer Output [8]: [hash(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 42) AS hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(45) WholeStageCodegenTransformer (X) +(46) WholeStageCodegenTransformer (X) Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: false -(46) ColumnarExchange +(47) VeloxAppendBatches +Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 
sum#X, isEmpty#X] +Arguments: X + +(48) ColumnarExchange Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(47) ShuffleQueryStage +(49) ShuffleQueryStage Output [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: X -(48) InputAdapter +(50) InputAdapter Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(49) InputIteratorTransformer +(51) InputIteratorTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(50) RegularHashAggregateExecTransformer +(52) RegularHashAggregateExecTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(51) WholeStageCodegenTransformer (X) +(53) WholeStageCodegenTransformer (X) Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: false -(52) TakeOrderedAndProjectExecTransformer +(54) TakeOrderedAndProjectExecTransformer Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X], 0 -(53) VeloxColumnarToRowExec +(55) VeloxColumnarToRowExec Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(54) Scan parquet +(56) Scan parquet Output [2]: [c_custkey#X, c_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey)] ReadSchema: struct -(55) Filter +(57) Filter Input [2]: [c_custkey#X, c_name#X] Condition : isnotnull(c_custkey#X) -(56) BroadcastExchange +(58) BroadcastExchange Input [2]: [c_custkey#X, c_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(57) Scan parquet +(59) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(58) Filter +(60) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Condition : (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(59) Scan parquet +(61) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(60) HashAggregate +(62) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(61) Exchange +(63) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(62) HashAggregate +(64) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: 
[sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(63) Filter +(65) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(64) Project +(66) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(65) BroadcastExchange +(67) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(66) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(67) BroadcastHashJoin +(69) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(68) Project +(70) Project Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(69) Scan parquet +(71) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [2]: [l_orderkey#X, l_quantity#X] Condition : isnotnull(l_orderkey#X) -(71) Scan parquet +(73) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(72) HashAggregate +(74) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(73) Exchange +(75) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) HashAggregate +(76) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(75) Filter +(77) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(76) Project +(78) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(77) BroadcastExchange +(79) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(78) BroadcastHashJoin +(80) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(79) BroadcastExchange +(81) BroadcastExchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(80) BroadcastHashJoin +(82) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(81) Project +(83) Project Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(82) HashAggregate +(84) HashAggregate Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, 
o_totalprice#X, sum#X, isEmpty#X] -(83) Exchange +(85) Exchange Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(84) HashAggregate +(86) HashAggregate Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(85) TakeOrderedAndProject +(87) TakeOrderedAndProject Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(86) AdaptiveSparkPlan +(88) AdaptiveSparkPlan Output [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/19.txt index 3350a6c3009f4..f044660313529 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/19.txt @@ -1,36 +1,35 @@ == Physical Plan == -AdaptiveSparkPlan (33) +AdaptiveSparkPlan (34) +- == Final Plan == - VeloxColumnarToRowExec (21) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (22) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) :- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ BroadcastQueryStage (8), Statistics(X) - +- ColumnarBroadcastExchange (7) - +- ^ FilterExecTransformer (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (8), Statistics(X) + +- ColumnarBroadcastExchange (7) + +- ^ FilterExecTransformer (5) + +- ^ Scan parquet (4) +- == Initial Plan == - HashAggregate (32) - +- Exchange (31) - +- HashAggregate (30) - +- Project (29) - +- BroadcastHashJoin Inner BuildRight (28) - :- Project (24) - : +- Filter (23) - : +- Scan parquet (22) - +- BroadcastExchange (27) - +- Filter (26) - +- Scan parquet (25) + HashAggregate (33) + +- Exchange (32) + +- HashAggregate (31) + +- Project (30) + +- BroadcastHashJoin Inner BuildRight (29) + :- Project (25) + : +- Filter (24) + : +- Scan parquet (23) + +- BroadcastExchange (28) + +- Filter (27) + +- Scan parquet (26) (1) Scan parquet @@ -97,91 +96,95 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(16) ColumnarExchange Input [2]: [sum#X, 
isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [2]: [sum#X, isEmpty#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(20) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(21) VeloxColumnarToRowExec +(22) VeloxColumnarToRowExec Input [1]: [revenue#X] -(22) Scan parquet +(23) Scan parquet Output [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipinstruct), In(l_shipmode, [AIR,AIR REG]), EqualTo(l_shipinstruct,DELIVER IN PERSON), IsNotNull(l_partkey), Or(Or(And(GreaterThanOrEqual(l_quantity,1.00),LessThanOrEqual(l_quantity,11.00)),And(GreaterThanOrEqual(l_quantity,10.00),LessThanOrEqual(l_quantity,20.00))),And(GreaterThanOrEqual(l_quantity,20.00),LessThanOrEqual(l_quantity,30.00)))] ReadSchema: struct -(23) Filter +(24) Filter Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Condition : ((((isnotnull(l_shipinstruct#X) AND l_shipmode#X IN (AIR,AIR REG)) AND (l_shipinstruct#X = DELIVER IN PERSON)) AND isnotnull(l_partkey#X)) AND ((((l_quantity#X >= 1.00) AND (l_quantity#X <= 11.00)) OR ((l_quantity#X >= 10.00) AND (l_quantity#X <= 20.00))) OR ((l_quantity#X >= 20.00) AND (l_quantity#X <= 30.00)))) -(24) Project +(25) Project Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] -(25) Scan parquet +(26) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(26) Filter +(27) Filter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Condition : (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X 
IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(27) BroadcastExchange +(28) BroadcastExchange Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(28) BroadcastHashJoin +(29) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(29) Project +(30) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(30) HashAggregate +(31) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(31) Exchange +(32) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(32) HashAggregate +(33) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(33) AdaptiveSparkPlan +(34) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/20.txt index a95115d00b26f..5cd3c9d35c2c7 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/20.txt @@ -1,103 +1,101 @@ == Physical Plan == -AdaptiveSparkPlan (104) +AdaptiveSparkPlan (109) +- == Final Plan == - VeloxColumnarToRowExec (67) - +- AQEShuffleRead (66) - +- ShuffleQueryStage (65), Statistics(X) - +- ColumnarExchange (64) - +- ^ ProjectExecTransformer (62) - +- ^ BroadcastHashJoinExecTransformer Inner (61) - :- ^ ProjectExecTransformer (52) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (51) - : :- ^ InputIteratorTransformer (9) - : : +- ^ InputAdapter (8) - : : +- ^ AQEShuffleRead (7) - : : +- ^ 
ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) - : : +- ^ ProjectExecTransformer (3) - : : +- ^ FilterExecTransformer (2) - : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (50) - : +- ^ InputAdapter (49) - : +- ^ BroadcastQueryStage (48), Statistics(X) - : +- ColumnarBroadcastExchange (47) - : +- ^ ProjectExecTransformer (45) - : +- ^ BroadcastHashJoinExecTransformer Inner (44) - : :- ^ InputIteratorTransformer (25) - : : +- ^ InputAdapter (24) - : : +- ^ BroadcastQueryStage (23), Statistics(X) - : : +- ColumnarBroadcastExchange (22) - : : +- ^ BroadcastHashJoinExecTransformer LeftSemi (20) - : : :- ^ FilterExecTransformer (11) - : : : +- ^ Scan parquet (10) - : : +- ^ InputIteratorTransformer (19) - : : +- ^ InputAdapter (18) - : : +- ^ BroadcastQueryStage (17), Statistics(X) - : : +- ColumnarBroadcastExchange (16) - : : +- ^ ProjectExecTransformer (14) - : : +- ^ FilterExecTransformer (13) - : : +- ^ Scan parquet (12) - : +- ^ FilterExecTransformer (43) - : +- ^ ProjectExecTransformer (42) - : +- ^ RegularHashAggregateExecTransformer (41) - : +- ^ InputIteratorTransformer (40) - : +- ^ InputAdapter (39) - : +- ^ ShuffleQueryStage (38), Statistics(X) - : +- ColumnarExchange (37) - : +- ^ ProjectExecTransformer (35) - : +- ^ FlushableHashAggregateExecTransformer (34) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (33) - : :- ^ ProjectExecTransformer (28) - : : +- ^ FilterExecTransformer (27) - : : +- ^ Scan parquet (26) - : +- ^ InputIteratorTransformer (32) - : +- ^ InputAdapter (31) - : +- ^ BroadcastQueryStage (30), Statistics(X) - : +- ReusedExchange (29) - +- ^ InputIteratorTransformer (60) - +- ^ InputAdapter (59) - +- ^ BroadcastQueryStage (58), Statistics(X) - +- ColumnarBroadcastExchange (57) - +- ^ ProjectExecTransformer (55) - +- ^ FilterExecTransformer (54) - +- ^ Scan parquet (53) + VeloxColumnarToRowExec (70) + +- AQEShuffleRead (69) + +- ShuffleQueryStage (68), Statistics(X) + +- ColumnarExchange (67) + +- VeloxAppendBatches (66) + +- ^ ProjectExecTransformer (64) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (63) + :- ^ ProjectExecTransformer (54) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (53) + : :- ^ InputIteratorTransformer (10) + : : +- AQEShuffleRead (8) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) + : : +- ^ ProjectExecTransformer (3) + : : +- ^ FilterExecTransformer (2) + : : +- ^ Scan parquet (1) + : +- ^ InputIteratorTransformer (52) + : +- BroadcastQueryStage (50), Statistics(X) + : +- ColumnarBroadcastExchange (49) + : +- ^ ProjectExecTransformer (47) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (46) + : :- ^ InputIteratorTransformer (26) + : : +- BroadcastQueryStage (24), Statistics(X) + : : +- ColumnarBroadcastExchange (23) + : : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (21) + : : :- ^ FilterExecTransformer (12) + : : : +- ^ Scan parquet (11) + : : +- ^ InputIteratorTransformer (20) + : : +- BroadcastQueryStage (18), Statistics(X) + : : +- ColumnarBroadcastExchange (17) + : : +- ^ ProjectExecTransformer (15) + : : +- ^ FilterExecTransformer (14) + : : +- ^ Scan parquet (13) + : +- ^ FilterExecTransformer (45) + : +- ^ ProjectExecTransformer (44) + : +- ^ RegularHashAggregateExecTransformer (43) + : +- ^ InputIteratorTransformer (42) + : +- ShuffleQueryStage (40), Statistics(X) + : +- ColumnarExchange (39) + : +- VeloxAppendBatches (38) + : +- ^ ProjectExecTransformer (36) + : +- ^ 
FlushableHashAggregateExecTransformer (35) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (34) + : :- ^ ProjectExecTransformer (29) + : : +- ^ FilterExecTransformer (28) + : : +- ^ Scan parquet (27) + : +- ^ InputIteratorTransformer (33) + : +- BroadcastQueryStage (31), Statistics(X) + : +- ReusedExchange (30) + +- ^ InputIteratorTransformer (62) + +- BroadcastQueryStage (60), Statistics(X) + +- ColumnarBroadcastExchange (59) + +- ^ ProjectExecTransformer (57) + +- ^ FilterExecTransformer (56) + +- ^ Scan parquet (55) +- == Initial Plan == - Sort (103) - +- Exchange (102) - +- Project (101) - +- BroadcastHashJoin Inner BuildRight (100) - :- Project (95) - : +- ShuffledHashJoin LeftSemi BuildRight (94) - : :- Exchange (70) - : : +- Filter (69) - : : +- Scan parquet (68) - : +- Exchange (93) - : +- Project (92) - : +- BroadcastHashJoin Inner BuildLeft (91) - : :- BroadcastExchange (78) - : : +- BroadcastHashJoin LeftSemi BuildRight (77) - : : :- Filter (72) - : : : +- Scan parquet (71) - : : +- BroadcastExchange (76) - : : +- Project (75) - : : +- Filter (74) - : : +- Scan parquet (73) - : +- Filter (90) - : +- HashAggregate (89) - : +- Exchange (88) - : +- HashAggregate (87) - : +- BroadcastHashJoin LeftSemi BuildRight (86) - : :- Project (81) - : : +- Filter (80) - : : +- Scan parquet (79) - : +- BroadcastExchange (85) - : +- Project (84) - : +- Filter (83) - : +- Scan parquet (82) - +- BroadcastExchange (99) - +- Project (98) - +- Filter (97) - +- Scan parquet (96) + Sort (108) + +- Exchange (107) + +- Project (106) + +- BroadcastHashJoin Inner BuildRight (105) + :- Project (100) + : +- SortMergeJoin LeftSemi (99) + : :- Sort (74) + : : +- Exchange (73) + : : +- Filter (72) + : : +- Scan parquet (71) + : +- Sort (98) + : +- Exchange (97) + : +- Project (96) + : +- BroadcastHashJoin Inner BuildLeft (95) + : :- BroadcastExchange (82) + : : +- BroadcastHashJoin LeftSemi BuildRight (81) + : : :- Filter (76) + : : : +- Scan parquet (75) + : : +- BroadcastExchange (80) + : : +- Project (79) + : : +- Filter (78) + : : +- Scan parquet (77) + : +- Filter (94) + : +- HashAggregate (93) + : +- Exchange (92) + : +- HashAggregate (91) + : +- BroadcastHashJoin LeftSemi BuildRight (90) + : :- Project (85) + : : +- Filter (84) + : : +- Scan parquet (83) + : +- BroadcastExchange (89) + : +- Project (88) + : +- Filter (87) + : +- Scan parquet (86) + +- BroadcastExchange (104) + +- Project (103) + +- Filter (102) + +- Scan parquet (101) (1) Scan parquet @@ -119,438 +117,458 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: X -(7) AQEShuffleRead +(8) AQEShuffleRead Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: local -(8) InputAdapter +(9) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(10) 
Scan parquet +(11) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(11) FilterExecTransformer +(12) FilterExecTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(12) Scan parquet +(13) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(13) FilterExecTransformer +(14) FilterExecTransformer Input [2]: [p_partkey#X, p_name#X] Arguments: (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(14) ProjectExecTransformer +(15) ProjectExecTransformer Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(15) WholeStageCodegenTransformer (X) +(16) WholeStageCodegenTransformer (X) Input [1]: [p_partkey#X] Arguments: false -(16) ColumnarBroadcastExchange +(17) ColumnarBroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(17) BroadcastQueryStage +(18) BroadcastQueryStage Output [1]: [p_partkey#X] Arguments: X -(18) InputAdapter +(19) InputAdapter Input [1]: [p_partkey#X] -(19) InputIteratorTransformer +(20) InputIteratorTransformer Input [1]: [p_partkey#X] -(20) BroadcastHashJoinExecTransformer +(21) BroadcastHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(22) ColumnarBroadcastExchange +(23) ColumnarBroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(23) BroadcastQueryStage +(24) BroadcastQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(24) InputAdapter +(25) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(25) InputIteratorTransformer +(26) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(26) Scan parquet +(27) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(27) FilterExecTransformer +(28) FilterExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Arguments: ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(28) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(29) ReusedExchange [Reuses operator id: 16] +(30) ReusedExchange [Reuses operator id: 17] Output [1]: [p_partkey#X] -(30) BroadcastQueryStage +(31) BroadcastQueryStage Output [1]: [p_partkey#X] Arguments: X -(31) InputAdapter +(32) InputAdapter Input [1]: [p_partkey#X] -(32) InputIteratorTransformer +(33) InputIteratorTransformer Input [1]: [p_partkey#X] -(33) BroadcastHashJoinExecTransformer +(34) 
BroadcastHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(34) FlushableHashAggregateExecTransformer +(35) FlushableHashAggregateExecTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(35) ProjectExecTransformer +(36) ProjectExecTransformer Output [5]: [hash(l_partkey#X, l_suppkey#X, 42) AS hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(36) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(37) ColumnarExchange +(38) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(39) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(38) ShuffleQueryStage +(40) ShuffleQueryStage Output [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(39) InputAdapter +(41) InputAdapter Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(40) InputIteratorTransformer +(42) InputIteratorTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(41) RegularHashAggregateExecTransformer +(43) RegularHashAggregateExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(42) ProjectExecTransformer +(44) ProjectExecTransformer Output [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3)) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(43) FilterExecTransformer +(45) FilterExecTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: isnotnull((0.5 * sum(l_quantity))#X) -(44) BroadcastHashJoinExecTransformer +(46) BroadcastHashJoinExecTransformer Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(45) ProjectExecTransformer +(47) ProjectExecTransformer Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(46) WholeStageCodegenTransformer (X) +(48) WholeStageCodegenTransformer (X) Input [1]: [ps_suppkey#X] Arguments: false -(47) ColumnarBroadcastExchange +(49) ColumnarBroadcastExchange Input [1]: [ps_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(48) BroadcastQueryStage +(50) BroadcastQueryStage Output [1]: [ps_suppkey#X] Arguments: X -(49) InputAdapter +(51) InputAdapter Input [1]: [ps_suppkey#X] -(50) InputIteratorTransformer +(52) InputIteratorTransformer Input [1]: [ps_suppkey#X] -(51) BroadcastHashJoinExecTransformer +(53) BroadcastHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(52) ProjectExecTransformer +(54) ProjectExecTransformer Output [3]: [s_name#X, s_address#X, 
s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(53) Scan parquet +(55) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(54) FilterExecTransformer +(56) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(55) ProjectExecTransformer +(57) ProjectExecTransformer Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(56) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [1]: [n_nationkey#X] Arguments: false -(57) ColumnarBroadcastExchange +(59) ColumnarBroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(58) BroadcastQueryStage +(60) BroadcastQueryStage Output [1]: [n_nationkey#X] Arguments: X -(59) InputAdapter +(61) InputAdapter Input [1]: [n_nationkey#X] -(60) InputIteratorTransformer +(62) InputIteratorTransformer Input [1]: [n_nationkey#X] -(61) BroadcastHashJoinExecTransformer +(63) BroadcastHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(62) ProjectExecTransformer +(64) ProjectExecTransformer Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(63) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(64) ColumnarExchange +(66) VeloxAppendBatches +Input [2]: [s_name#X, s_address#X] +Arguments: X + +(67) ColumnarExchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(65) ShuffleQueryStage +(68) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] Arguments: X -(66) AQEShuffleRead +(69) AQEShuffleRead Input [2]: [s_name#X, s_address#X] Arguments: local -(67) VeloxColumnarToRowExec +(70) VeloxColumnarToRowExec Input [2]: [s_name#X, s_address#X] -(68) Scan parquet +(71) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_nationkey)] ReadSchema: struct -(69) Filter +(72) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Condition : isnotnull(s_nationkey#X) -(70) Exchange +(73) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(74) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(75) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(72) Filter +(76) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Condition : ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(73) Scan parquet +(77) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(74) Filter +(78) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(75) Project 
+(79) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(76) BroadcastExchange +(80) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(77) BroadcastHashJoin +(81) BroadcastHashJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(78) BroadcastExchange +(82) BroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(79) Scan parquet +(83) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(80) Filter +(84) Filter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(81) Project +(85) Project Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(82) Scan parquet +(86) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(83) Filter +(87) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(84) Project +(88) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(85) BroadcastExchange +(89) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(86) BroadcastHashJoin +(90) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(87) HashAggregate +(91) HashAggregate Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(88) Exchange +(92) Exchange Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(89) HashAggregate +(93) HashAggregate Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3)) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(90) Filter +(94) Filter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Condition : isnotnull((0.5 * sum(l_quantity))#X) -(91) BroadcastHashJoin +(95) BroadcastHashJoin Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(92) Project +(96) Project Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(93) Exchange +(97) Exchange Input [1]: [ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(94) ShuffledHashJoin +(98) Sort +Input [1]: 
[ps_suppkey#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(99) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(95) Project +(100) Project Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(96) Scan parquet +(101) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(97) Filter +(102) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(98) Project +(103) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(99) BroadcastExchange +(104) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(100) BroadcastHashJoin +(105) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(101) Project +(106) Project Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(102) Exchange +(107) Exchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(103) Sort +(108) Sort Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(104) AdaptiveSparkPlan +(109) AdaptiveSparkPlan Output [2]: [s_name#X, s_address#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/21.txt index 2e1ddb4e0851b..3ffdec004af2d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/21.txt @@ -1,90 +1,85 @@ == Physical Plan == -AdaptiveSparkPlan (91) +AdaptiveSparkPlan (92) +- == Final Plan == - VeloxColumnarToRowExec (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) + VeloxColumnarToRowExec (59) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ InputIteratorTransformer (56) + +- ShuffleQueryStage (54), Statistics(X) + +- ColumnarExchange (53) + +- VeloxAppendBatches (52) +- ^ ProjectExecTransformer (50) +- ^ FlushableHashAggregateExecTransformer (49) +- ^ ProjectExecTransformer (48) - +- ^ BroadcastHashJoinExecTransformer Inner (47) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) :- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (37) : :- ^ ProjectExecTransformer (28) - : : +- ^ BroadcastHashJoinExecTransformer Inner (27) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (27) : : :- ^ InputIteratorTransformer (7) - : : : +- ^ InputAdapter (6) - : : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : : +- ColumnarBroadcastExchange (4) - : : : +- ^ FilterExecTransformer (2) - : : : +- ^ Scan parquet (1) - : : +- ^ BroadcastHashJoinExecTransformer LeftAnti (26) - : : :- ^ BroadcastHashJoinExecTransformer LeftSemi (17) + : : : +- BroadcastQueryStage (5), Statistics(X) + : : : +- ColumnarBroadcastExchange (4) + : : : +- ^ FilterExecTransformer (2) + : : : +- ^ Scan 
parquet (1) + : : +- ^ BroadcastHashJoinExecTransformer LeftAnti BuildRight (26) + : : :- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (17) : : : :- ^ ProjectExecTransformer (10) : : : : +- ^ FilterExecTransformer (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (16) - : : : +- ^ InputAdapter (15) - : : : +- ^ BroadcastQueryStage (14), Statistics(X) - : : : +- ColumnarBroadcastExchange (13) - : : : +- ^ Scan parquet (11) + : : : +- BroadcastQueryStage (14), Statistics(X) + : : : +- ColumnarBroadcastExchange (13) + : : : +- ^ Scan parquet (11) : : +- ^ InputIteratorTransformer (25) - : : +- ^ InputAdapter (24) - : : +- ^ BroadcastQueryStage (23), Statistics(X) - : : +- ColumnarBroadcastExchange (22) - : : +- ^ ProjectExecTransformer (20) - : : +- ^ FilterExecTransformer (19) - : : +- ^ Scan parquet (18) + : : +- BroadcastQueryStage (23), Statistics(X) + : : +- ColumnarBroadcastExchange (22) + : : +- ^ ProjectExecTransformer (20) + : : +- ^ FilterExecTransformer (19) + : : +- ^ Scan parquet (18) : +- ^ InputIteratorTransformer (36) - : +- ^ InputAdapter (35) - : +- ^ BroadcastQueryStage (34), Statistics(X) - : +- ColumnarBroadcastExchange (33) - : +- ^ ProjectExecTransformer (31) - : +- ^ FilterExecTransformer (30) - : +- ^ Scan parquet (29) + : +- BroadcastQueryStage (34), Statistics(X) + : +- ColumnarBroadcastExchange (33) + : +- ^ ProjectExecTransformer (31) + : +- ^ FilterExecTransformer (30) + : +- ^ Scan parquet (29) +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ BroadcastQueryStage (44), Statistics(X) - +- ColumnarBroadcastExchange (43) - +- ^ ProjectExecTransformer (41) - +- ^ FilterExecTransformer (40) - +- ^ Scan parquet (39) + +- BroadcastQueryStage (44), Statistics(X) + +- ColumnarBroadcastExchange (43) + +- ^ ProjectExecTransformer (41) + +- ^ FilterExecTransformer (40) + +- ^ Scan parquet (39) +- == Initial Plan == - TakeOrderedAndProject (90) - +- HashAggregate (89) - +- Exchange (88) - +- HashAggregate (87) - +- Project (86) - +- BroadcastHashJoin Inner BuildRight (85) - :- Project (80) - : +- BroadcastHashJoin Inner BuildRight (79) - : :- Project (74) - : : +- BroadcastHashJoin Inner BuildLeft (73) - : : :- BroadcastExchange (61) - : : : +- Filter (60) - : : : +- Scan parquet (59) - : : +- BroadcastHashJoin LeftAnti BuildRight (72) - : : :- BroadcastHashJoin LeftSemi BuildRight (67) - : : : :- Project (64) - : : : : +- Filter (63) - : : : : +- Scan parquet (62) - : : : +- BroadcastExchange (66) - : : : +- Scan parquet (65) - : : +- BroadcastExchange (71) - : : +- Project (70) - : : +- Filter (69) - : : +- Scan parquet (68) - : +- BroadcastExchange (78) - : +- Project (77) - : +- Filter (76) - : +- Scan parquet (75) - +- BroadcastExchange (84) - +- Project (83) - +- Filter (82) - +- Scan parquet (81) + TakeOrderedAndProject (91) + +- HashAggregate (90) + +- Exchange (89) + +- HashAggregate (88) + +- Project (87) + +- BroadcastHashJoin Inner BuildRight (86) + :- Project (81) + : +- BroadcastHashJoin Inner BuildRight (80) + : :- Project (75) + : : +- BroadcastHashJoin Inner BuildLeft (74) + : : :- BroadcastExchange (62) + : : : +- Filter (61) + : : : +- Scan parquet (60) + : : +- BroadcastHashJoin LeftAnti BuildRight (73) + : : :- BroadcastHashJoin LeftSemi BuildRight (68) + : : : :- Project (65) + : : : : +- Filter (64) + : : : : +- Scan parquet (63) + : : : +- BroadcastExchange (67) + : : : +- Scan parquet (66) + : : +- BroadcastExchange (72) + : : +- Project (71) + : : +- Filter (70) + : : +- Scan parquet (69) + : +- 
BroadcastExchange (79) + : +- Project (78) + : +- Filter (77) + : +- Scan parquet (76) + +- BroadcastExchange (85) + +- Project (84) + +- Filter (83) + +- Scan parquet (82) (1) Scan parquet @@ -306,190 +301,194 @@ Input [2]: [s_name#X, count#X] Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: false -(52) ColumnarExchange +(52) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, count#X] +Arguments: X + +(53) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(54) ShuffleQueryStage Output [2]: [s_name#X, count#X] Arguments: X -(54) InputAdapter +(55) InputAdapter Input [2]: [s_name#X, count#X] -(55) InputIteratorTransformer +(56) InputIteratorTransformer Input [2]: [s_name#X, count#X] -(56) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(57) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, numwait#X] Arguments: false -(58) VeloxColumnarToRowExec +(59) VeloxColumnarToRowExec Input [2]: [s_name#X, numwait#X] -(59) Scan parquet +(60) Scan parquet Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(60) Filter +(61) Filter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(61) BroadcastExchange +(62) BroadcastExchange Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(62) Scan parquet +(63) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(63) Filter +(64) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(64) Project +(65) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(65) Scan parquet +(66) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(66) BroadcastExchange +(67) BroadcastExchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(67) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(68) Scan parquet +(69) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(69) Filter +(70) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(70) Project +(71) Project Output [2]: 
[l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(71) BroadcastExchange +(72) BroadcastExchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(72) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(73) BroadcastHashJoin +(74) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(74) Project +(75) Project Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(75) Scan parquet +(76) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(76) Filter +(77) Filter Input [2]: [o_orderkey#X, o_orderstatus#X] Condition : ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(77) Project +(78) Project Output [1]: [o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(78) BroadcastExchange +(79) BroadcastExchange Input [1]: [o_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(79) BroadcastHashJoin +(80) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(80) Project +(81) Project Output [2]: [s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(81) Scan parquet +(82) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(82) Filter +(83) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(83) Project +(84) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(84) BroadcastExchange +(85) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(85) BroadcastHashJoin +(86) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(86) Project +(87) Project Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(87) HashAggregate +(88) HashAggregate Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(88) Exchange +(89) Exchange Input [2]: [s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(89) HashAggregate +(90) HashAggregate Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(90) TakeOrderedAndProject +(91) TakeOrderedAndProject Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X] -(91) AdaptiveSparkPlan +(92) AdaptiveSparkPlan Output [2]: [s_name#X, numwait#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/22.txt index 
774b9ca851655..f30752effaa42 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/22.txt @@ -1,40 +1,39 @@ == Physical Plan == -AdaptiveSparkPlan (38) +AdaptiveSparkPlan (40) +- == Final Plan == - VeloxColumnarToRowExec (26) - +- ^ SortExecTransformer (24) - +- ^ InputIteratorTransformer (23) - +- ^ InputAdapter (22) - +- ^ ShuffleQueryStage (21), Statistics(X) - +- ColumnarExchange (20) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ InputIteratorTransformer (17) - +- ^ InputAdapter (16) - +- ^ ShuffleQueryStage (15), Statistics(X) - +- ColumnarExchange (14) + VeloxColumnarToRowExec (28) + +- ^ SortExecTransformer (26) + +- ^ InputIteratorTransformer (25) + +- ShuffleQueryStage (23), Statistics(X) + +- ColumnarExchange (22) + +- VeloxAppendBatches (21) + +- ^ RegularHashAggregateExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) +- ^ ProjectExecTransformer (12) +- ^ FlushableHashAggregateExecTransformer (11) +- ^ ProjectExecTransformer (10) - +- ^ BroadcastHashJoinExecTransformer LeftAnti (9) + +- ^ BroadcastHashJoinExecTransformer LeftAnti BuildRight (9) :- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (8) - +- ^ InputAdapter (7) - +- ^ BroadcastQueryStage (6), Statistics(X) - +- ColumnarBroadcastExchange (5) - +- ^ Scan parquet (3) + +- BroadcastQueryStage (6), Statistics(X) + +- ColumnarBroadcastExchange (5) + +- ^ Scan parquet (3) +- == Initial Plan == - Sort (37) - +- Exchange (36) - +- HashAggregate (35) - +- Exchange (34) - +- HashAggregate (33) - +- Project (32) - +- BroadcastHashJoin LeftAnti BuildRight (31) - :- Filter (28) - : +- Scan parquet (27) - +- BroadcastExchange (30) - +- Scan parquet (29) + Sort (39) + +- Exchange (38) + +- HashAggregate (37) + +- Exchange (36) + +- HashAggregate (35) + +- Project (34) + +- BroadcastHashJoin LeftAnti BuildRight (33) + :- Filter (30) + : +- Scan parquet (29) + +- BroadcastExchange (32) + +- Scan parquet (31) (1) Scan parquet @@ -96,248 +95,260 @@ Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(19) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(20) ColumnarExchange +(21) 
VeloxAppendBatches +Input [3]: [cntrycode#X, numcust#X, totacctbal#X] +Arguments: X + +(22) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(23) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: X -(22) InputAdapter +(24) InputAdapter Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(23) InputIteratorTransformer +(25) InputIteratorTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(24) SortExecTransformer +(26) SortExecTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(25) WholeStageCodegenTransformer (X) +(27) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(26) VeloxColumnarToRowExec +(28) VeloxColumnarToRowExec Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(27) Scan parquet +(29) Scan parquet Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal)] ReadSchema: struct -(28) Filter +(30) Filter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) AND (cast(c_acctbal#X as decimal(16,6)) > Subquery subquery#X, [id=#X])) -(29) Scan parquet +(31) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(30) BroadcastExchange +(32) BroadcastExchange Input [1]: [o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(31) BroadcastHashJoin +(33) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(32) Project +(34) Project Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(33) HashAggregate +(35) HashAggregate Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(34) Exchange +(36) Exchange Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(35) HashAggregate +(37) HashAggregate Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(36) Exchange +(38) Exchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(37) Sort +(39) Sort Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(38) AdaptiveSparkPlan +(40) AdaptiveSparkPlan Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: isFinalPlan=true ===== Subqueries ===== Subquery:1 Hosting operator id = 2 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (57) +AdaptiveSparkPlan (60) +- == Final Plan == - VeloxColumnarToRowExec (50) - +- ^ RegularHashAggregateExecTransformer (48) - +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ ShuffleQueryStage (45), Statistics(X) - +- ColumnarExchange (44) - +- ^ 
FlushableHashAggregateExecTransformer (42) - +- ^ ProjectExecTransformer (41) - +- ^ FilterExecTransformer (40) - +- ^ Scan parquet (39) + VeloxColumnarToRowExec (53) + +- ^ RegularHashAggregateExecTransformer (51) + +- ^ InputIteratorTransformer (50) + +- ShuffleQueryStage (48), Statistics(X) + +- ColumnarExchange (47) + +- VeloxAppendBatches (46) + +- ^ FlushableHashAggregateExecTransformer (44) + +- ^ ProjectExecTransformer (43) + +- ^ FilterExecTransformer (42) + +- ^ Scan parquet (41) +- == Initial Plan == - HashAggregate (56) - +- Exchange (55) - +- HashAggregate (54) - +- Project (53) - +- Filter (52) - +- Scan parquet (51) + HashAggregate (59) + +- Exchange (58) + +- HashAggregate (57) + +- Project (56) + +- Filter (55) + +- Scan parquet (54) -(39) Scan parquet +(41) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(40) FilterExecTransformer +(42) FilterExecTransformer Input [2]: [c_phone#X, c_acctbal#X] Arguments: ((isnotnull(c_acctbal#X) AND (c_acctbal#X > 0.00)) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) -(41) ProjectExecTransformer +(43) ProjectExecTransformer Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(42) FlushableHashAggregateExecTransformer +(44) FlushableHashAggregateExecTransformer Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(43) WholeStageCodegenTransformer (X) +(45) WholeStageCodegenTransformer (X) Input [2]: [sum#X, count#X] Arguments: false -(44) ColumnarExchange +(46) VeloxAppendBatches +Input [2]: [sum#X, count#X] +Arguments: X + +(47) ColumnarExchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(45) ShuffleQueryStage +(48) ShuffleQueryStage Output [2]: [sum#X, count#X] Arguments: X -(46) InputAdapter +(49) InputAdapter Input [2]: [sum#X, count#X] -(47) InputIteratorTransformer +(50) InputIteratorTransformer Input [2]: [sum#X, count#X] -(48) RegularHashAggregateExecTransformer +(51) RegularHashAggregateExecTransformer Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(49) WholeStageCodegenTransformer (X) +(52) WholeStageCodegenTransformer (X) Input [1]: [avg(c_acctbal)#X] Arguments: false -(50) VeloxColumnarToRowExec +(53) VeloxColumnarToRowExec Input [1]: [avg(c_acctbal)#X] -(51) Scan parquet +(54) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(52) Filter +(55) Filter Input [2]: [c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND (c_acctbal#X > 0.00)) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) -(53) Project +(56) Project Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(54) HashAggregate +(57) HashAggregate Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(55) Exchange +(58) Exchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(56) HashAggregate +(59) HashAggregate Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: 
[avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(57) AdaptiveSparkPlan +(60) AdaptiveSparkPlan Output [1]: [avg(c_acctbal)#X] Arguments: isFinalPlan=true Subquery:2 Hosting operator id = 1 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (57) +AdaptiveSparkPlan (60) +- == Final Plan == - VeloxColumnarToRowExec (50) - +- ^ RegularHashAggregateExecTransformer (48) - +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ ShuffleQueryStage (45), Statistics(X) - +- ColumnarExchange (44) - +- ^ FlushableHashAggregateExecTransformer (42) - +- ^ ProjectExecTransformer (41) - +- ^ FilterExecTransformer (40) - +- ^ Scan parquet (39) + VeloxColumnarToRowExec (53) + +- ^ RegularHashAggregateExecTransformer (51) + +- ^ InputIteratorTransformer (50) + +- ShuffleQueryStage (48), Statistics(X) + +- ColumnarExchange (47) + +- VeloxAppendBatches (46) + +- ^ FlushableHashAggregateExecTransformer (44) + +- ^ ProjectExecTransformer (43) + +- ^ FilterExecTransformer (42) + +- ^ Scan parquet (41) +- == Initial Plan == - HashAggregate (56) - +- Exchange (55) - +- HashAggregate (54) - +- Project (53) - +- Filter (52) - +- Scan parquet (51) \ No newline at end of file + HashAggregate (59) + +- Exchange (58) + +- HashAggregate (57) + +- Project (56) + +- Filter (55) + +- Scan parquet (54) \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/3.txt index 6a5c8395fa5d2..d9f87ca9b5381 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/3.txt @@ -1,55 +1,53 @@ == Physical Plan == -AdaptiveSparkPlan (53) +AdaptiveSparkPlan (54) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- TakeOrderedAndProjectExecTransformer (33) - +- ^ ProjectExecTransformer (31) - +- ^ RegularHashAggregateExecTransformer (30) - +- ^ InputIteratorTransformer (29) - +- ^ InputAdapter (28) - +- ^ ShuffleQueryStage (27), Statistics(X) - +- ColumnarExchange (26) + VeloxColumnarToRowExec (35) + +- TakeOrderedAndProjectExecTransformer (34) + +- ^ ProjectExecTransformer (32) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) +- ^ ProjectExecTransformer (24) +- ^ FlushableHashAggregateExecTransformer (23) +- ^ ProjectExecTransformer (22) - +- ^ BroadcastHashJoinExecTransformer Inner (21) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (21) :- ^ ProjectExecTransformer (12) - : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ BroadcastQueryStage (6), Statistics(X) - : : +- ColumnarBroadcastExchange (5) - : : +- ^ ProjectExecTransformer (3) - : : +- ^ FilterExecTransformer (2) - : : +- ^ Scan parquet (1) + : : +- BroadcastQueryStage (6), Statistics(X) + : : +- ColumnarBroadcastExchange (5) + : : +- ^ ProjectExecTransformer (3) + : : +- ^ FilterExecTransformer (2) + : : +- ^ Scan parquet (1) : +- ^ FilterExecTransformer (10) : +- ^ Scan parquet (9) +- ^ InputIteratorTransformer (20) - +- ^ InputAdapter (19) - +- ^ BroadcastQueryStage (18), Statistics(X) - +- ColumnarBroadcastExchange (17) - +- ^ ProjectExecTransformer (15) - +- ^ FilterExecTransformer (14) - +- ^ Scan parquet (13) + +- BroadcastQueryStage (18), 
Statistics(X) + +- ColumnarBroadcastExchange (17) + +- ^ ProjectExecTransformer (15) + +- ^ FilterExecTransformer (14) + +- ^ Scan parquet (13) +- == Initial Plan == - TakeOrderedAndProject (52) - +- HashAggregate (51) - +- Exchange (50) - +- HashAggregate (49) - +- Project (48) - +- BroadcastHashJoin Inner BuildRight (47) - :- Project (42) - : +- BroadcastHashJoin Inner BuildLeft (41) - : :- BroadcastExchange (38) - : : +- Project (37) - : : +- Filter (36) - : : +- Scan parquet (35) - : +- Filter (40) - : +- Scan parquet (39) - +- BroadcastExchange (46) - +- Project (45) - +- Filter (44) - +- Scan parquet (43) + TakeOrderedAndProject (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- Project (49) + +- BroadcastHashJoin Inner BuildRight (48) + :- Project (43) + : +- BroadcastHashJoin Inner BuildLeft (42) + : :- BroadcastExchange (39) + : : +- Project (38) + : : +- Filter (37) + : : +- Scan parquet (36) + : +- Filter (41) + : +- Scan parquet (40) + +- BroadcastExchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -162,131 +160,135 @@ Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: false -(26) ColumnarExchange +(26) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] +Arguments: X + +(27) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(27) ShuffleQueryStage +(28) ShuffleQueryStage Output [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: X -(28) InputAdapter +(29) InputAdapter Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(29) InputIteratorTransformer +(30) InputIteratorTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(30) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(31) ProjectExecTransformer +(32) ProjectExecTransformer Output [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] Input [4]: [l_orderkey#X, 
o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(32) WholeStageCodegenTransformer (X) +(33) WholeStageCodegenTransformer (X) Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: false -(33) TakeOrderedAndProjectExecTransformer +(34) TakeOrderedAndProjectExecTransformer Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X], 0 -(34) VeloxColumnarToRowExec +(35) VeloxColumnarToRowExec Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(35) Scan parquet +(36) Scan parquet Output [2]: [c_custkey#X, c_mktsegment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_mktsegment), EqualTo(c_mktsegment,BUILDING), IsNotNull(c_custkey)] ReadSchema: struct -(36) Filter +(37) Filter Input [2]: [c_custkey#X, c_mktsegment#X] Condition : ((isnotnull(c_mktsegment#X) AND (c_mktsegment#X = BUILDING)) AND isnotnull(c_custkey#X)) -(37) Project +(38) Project Output [1]: [c_custkey#X] Input [2]: [c_custkey#X, c_mktsegment#X] -(38) BroadcastExchange +(39) BroadcastExchange Input [1]: [c_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(39) Scan parquet +(40) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(40) Filter +(41) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Condition : (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(41) BroadcastHashJoin +(42) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(42) Project +(43) Project Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(43) Scan parquet +(44) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(44) Filter +(45) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(45) Project +(46) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(46) BroadcastExchange +(47) BroadcastExchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(47) BroadcastHashJoin +(48) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(48) Project +(49) Project Output [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(49) HashAggregate +(50) 
HashAggregate Input [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(50) Exchange +(51) Exchange Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) HashAggregate +(52) HashAggregate Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] -(52) TakeOrderedAndProject +(53) TakeOrderedAndProject Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(53) AdaptiveSparkPlan +(54) AdaptiveSparkPlan Output [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/4.txt index 0bbf78a26ac81..db9f7716a365c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/4.txt @@ -1,46 +1,45 @@ == Physical Plan == -AdaptiveSparkPlan (44) +AdaptiveSparkPlan (46) +- == Final Plan == - VeloxColumnarToRowExec (29) - +- ^ SortExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24), Statistics(X) - +- ColumnarExchange (23) - +- ^ RegularHashAggregateExecTransformer (21) - +- ^ InputIteratorTransformer (20) - +- ^ InputAdapter (19) - +- ^ ShuffleQueryStage (18), Statistics(X) - +- ColumnarExchange (17) + VeloxColumnarToRowExec (31) + +- ^ SortExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ InputIteratorTransformer (21) + +- ShuffleQueryStage (19), Statistics(X) + +- ColumnarExchange (18) + +- VeloxAppendBatches (17) +- ^ ProjectExecTransformer (15) +- ^ FlushableHashAggregateExecTransformer (14) +- ^ ProjectExecTransformer (13) - +- ^ BroadcastHashJoinExecTransformer LeftSemi (12) + +- ^ 
BroadcastHashJoinExecTransformer LeftSemi BuildRight (12) :- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (11) - +- ^ InputAdapter (10) - +- ^ BroadcastQueryStage (9), Statistics(X) - +- ColumnarBroadcastExchange (8) - +- ^ ProjectExecTransformer (6) - +- ^ FilterExecTransformer (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (9), Statistics(X) + +- ColumnarBroadcastExchange (8) + +- ^ ProjectExecTransformer (6) + +- ^ FilterExecTransformer (5) + +- ^ Scan parquet (4) +- == Initial Plan == - Sort (43) - +- Exchange (42) - +- HashAggregate (41) - +- Exchange (40) - +- HashAggregate (39) - +- Project (38) - +- BroadcastHashJoin LeftSemi BuildRight (37) - :- Project (32) - : +- Filter (31) - : +- Scan parquet (30) - +- BroadcastExchange (36) - +- Project (35) - +- Filter (34) - +- Scan parquet (33) + Sort (45) + +- Exchange (44) + +- HashAggregate (43) + +- Exchange (42) + +- HashAggregate (41) + +- Project (40) + +- BroadcastHashJoin LeftSemi BuildRight (39) + :- Project (34) + : +- Filter (33) + : +- Scan parquet (32) + +- BroadcastExchange (38) + +- Project (37) + +- Filter (36) + +- Scan parquet (35) (1) Scan parquet @@ -115,125 +114,133 @@ Input [2]: [o_orderpriority#X, count#X] Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: false -(17) ColumnarExchange +(17) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] +Arguments: X + +(18) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] -(18) ShuffleQueryStage +(19) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] Arguments: X -(19) InputAdapter +(20) InputAdapter Input [2]: [o_orderpriority#X, count#X] -(20) InputIteratorTransformer +(21) InputIteratorTransformer Input [2]: [o_orderpriority#X, count#X] -(21) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [2]: [o_orderpriority#X, order_count#X] +Arguments: X + +(25) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] Arguments: X -(25) InputAdapter +(27) InputAdapter Input [2]: [o_orderpriority#X, order_count#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [2]: [o_orderpriority#X, order_count#X] -(27) SortExecTransformer +(29) SortExecTransformer Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(28) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(29) VeloxColumnarToRowExec +(31) VeloxColumnarToRowExec Input [2]: [o_orderpriority#X, order_count#X] -(30) Scan parquet +(32) Scan parquet Output [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: 
[IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-07-01), LessThan(o_orderdate,1993-10-01)] ReadSchema: struct -(31) Filter +(33) Filter Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Condition : ((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-07-01)) AND (o_orderdate#X < 1993-10-01)) -(32) Project +(34) Project Output [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] -(33) Scan parquet +(35) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(34) Filter +(36) Filter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(35) Project +(37) Project Output [1]: [l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(36) BroadcastExchange +(38) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(37) BroadcastHashJoin +(39) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(38) Project +(40) Project Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(39) HashAggregate +(41) HashAggregate Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(40) Exchange +(42) Exchange Input [2]: [o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) HashAggregate +(43) HashAggregate Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(42) Exchange +(44) Exchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) Sort +(45) Sort Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(44) AdaptiveSparkPlan +(46) AdaptiveSparkPlan Output [2]: [o_orderpriority#X, order_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/5.txt index cc6561aefb3b1..67e9d847aefad 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/5.txt @@ -1,98 +1,93 @@ == Physical Plan == -AdaptiveSparkPlan (100) +AdaptiveSparkPlan (102) +- == Final Plan == - VeloxColumnarToRowExec (65) - +- ^ SortExecTransformer (63) - +- ^ InputIteratorTransformer (62) - +- ^ InputAdapter (61) - +- ^ ShuffleQueryStage (60), Statistics(X) - +- ColumnarExchange (59) - +- ^ RegularHashAggregateExecTransformer (57) - +- ^ InputIteratorTransformer (56) - +- ^ InputAdapter (55) - +- ^ ShuffleQueryStage (54), Statistics(X) - +- ColumnarExchange (53) + VeloxColumnarToRowExec (67) + +- ^ SortExecTransformer (65) + +- ^ InputIteratorTransformer (64) + +- ShuffleQueryStage (62), Statistics(X) + +- ColumnarExchange (61) + +- VeloxAppendBatches (60) + +- ^ RegularHashAggregateExecTransformer (58) + +- ^ 
InputIteratorTransformer (57) + +- ShuffleQueryStage (55), Statistics(X) + +- ColumnarExchange (54) + +- VeloxAppendBatches (53) +- ^ ProjectExecTransformer (51) +- ^ FlushableHashAggregateExecTransformer (50) +- ^ ProjectExecTransformer (49) - +- ^ BroadcastHashJoinExecTransformer Inner (48) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (48) :- ^ ProjectExecTransformer (39) - : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : :- ^ ProjectExecTransformer (30) - : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : :- ^ ProjectExecTransformer (21) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : :- ^ ProjectExecTransformer (12) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : :- ^ InputIteratorTransformer (7) - : : : : : +- ^ InputAdapter (6) - : : : : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (4) - : : : : : +- ^ FilterExecTransformer (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (5), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (4) + : : : : : +- ^ FilterExecTransformer (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ ProjectExecTransformer (10) : : : : +- ^ FilterExecTransformer (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (19) - : : : +- ^ InputAdapter (18) - : : : +- ^ BroadcastQueryStage (17), Statistics(X) - : : : +- ColumnarBroadcastExchange (16) - : : : +- ^ FilterExecTransformer (14) - : : : +- ^ Scan parquet (13) + : : : +- BroadcastQueryStage (17), Statistics(X) + : : : +- ColumnarBroadcastExchange (16) + : : : +- ^ FilterExecTransformer (14) + : : : +- ^ Scan parquet (13) : : +- ^ InputIteratorTransformer (28) - : : +- ^ InputAdapter (27) - : : +- ^ BroadcastQueryStage (26), Statistics(X) - : : +- ColumnarBroadcastExchange (25) - : : +- ^ FilterExecTransformer (23) - : : +- ^ Scan parquet (22) + : : +- BroadcastQueryStage (26), Statistics(X) + : : +- ColumnarBroadcastExchange (25) + : : +- ^ FilterExecTransformer (23) + : : +- ^ Scan parquet (22) : +- ^ InputIteratorTransformer (37) - : +- ^ InputAdapter (36) - : +- ^ BroadcastQueryStage (35), Statistics(X) - : +- ColumnarBroadcastExchange (34) - : +- ^ FilterExecTransformer (32) - : +- ^ Scan parquet (31) + : +- BroadcastQueryStage (35), Statistics(X) + : +- ColumnarBroadcastExchange (34) + : +- ^ FilterExecTransformer (32) + : +- ^ Scan parquet (31) +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ BroadcastQueryStage (45), Statistics(X) - +- ColumnarBroadcastExchange (44) - +- ^ ProjectExecTransformer (42) - +- ^ FilterExecTransformer (41) - +- ^ Scan parquet (40) + +- BroadcastQueryStage (45), Statistics(X) + +- ColumnarBroadcastExchange (44) + +- ^ ProjectExecTransformer (42) + +- ^ FilterExecTransformer (41) + +- ^ Scan parquet (40) +- == Initial Plan == - Sort (99) - +- Exchange (98) - +- HashAggregate (97) - +- Exchange (96) - +- HashAggregate (95) - +- Project (94) - +- BroadcastHashJoin Inner BuildRight (93) - :- Project (88) - : +- BroadcastHashJoin Inner BuildRight (87) - : :- Project (83) - : : +- BroadcastHashJoin Inner BuildRight (82) - : : :- Project (78) - : : : +- BroadcastHashJoin Inner BuildRight (77) - : : : :- Project (73) - : : : : +- BroadcastHashJoin Inner BuildLeft (72) 
- : : : : :- BroadcastExchange (68) - : : : : : +- Filter (67) - : : : : : +- Scan parquet (66) - : : : : +- Project (71) - : : : : +- Filter (70) - : : : : +- Scan parquet (69) - : : : +- BroadcastExchange (76) - : : : +- Filter (75) - : : : +- Scan parquet (74) - : : +- BroadcastExchange (81) - : : +- Filter (80) - : : +- Scan parquet (79) - : +- BroadcastExchange (86) - : +- Filter (85) - : +- Scan parquet (84) - +- BroadcastExchange (92) - +- Project (91) - +- Filter (90) - +- Scan parquet (89) + Sort (101) + +- Exchange (100) + +- HashAggregate (99) + +- Exchange (98) + +- HashAggregate (97) + +- Project (96) + +- BroadcastHashJoin Inner BuildRight (95) + :- Project (90) + : +- BroadcastHashJoin Inner BuildRight (89) + : :- Project (85) + : : +- BroadcastHashJoin Inner BuildRight (84) + : : :- Project (80) + : : : +- BroadcastHashJoin Inner BuildRight (79) + : : : :- Project (75) + : : : : +- BroadcastHashJoin Inner BuildLeft (74) + : : : : :- BroadcastExchange (70) + : : : : : +- Filter (69) + : : : : : +- Scan parquet (68) + : : : : +- Project (73) + : : : : +- Filter (72) + : : : : +- Scan parquet (71) + : : : +- BroadcastExchange (78) + : : : +- Filter (77) + : : : +- Scan parquet (76) + : : +- BroadcastExchange (83) + : : +- Filter (82) + : : +- Scan parquet (81) + : +- BroadcastExchange (88) + : +- Filter (87) + : +- Scan parquet (86) + +- BroadcastExchange (94) + +- Project (93) + +- Filter (92) + +- Scan parquet (91) (1) Scan parquet @@ -319,221 +314,229 @@ Input [3]: [n_name#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: false -(53) ColumnarExchange +(53) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] +Arguments: X + +(54) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(54) ShuffleQueryStage +(55) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] Arguments: X -(55) InputAdapter +(56) InputAdapter Input [3]: [n_name#X, sum#X, isEmpty#X] -(56) InputIteratorTransformer +(57) InputIteratorTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] -(57) RegularHashAggregateExecTransformer +(58) RegularHashAggregateExecTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(58) WholeStageCodegenTransformer (X) +(59) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(59) ColumnarExchange +(60) VeloxAppendBatches +Input [2]: [n_name#X, revenue#X] +Arguments: X + +(61) ColumnarExchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(60) ShuffleQueryStage +(62) ShuffleQueryStage Output 
[2]: [n_name#X, revenue#X] Arguments: X -(61) InputAdapter +(63) InputAdapter Input [2]: [n_name#X, revenue#X] -(62) InputIteratorTransformer +(64) InputIteratorTransformer Input [2]: [n_name#X, revenue#X] -(63) SortExecTransformer +(65) SortExecTransformer Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(64) WholeStageCodegenTransformer (X) +(66) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(65) VeloxColumnarToRowExec +(67) VeloxColumnarToRowExec Input [2]: [n_name#X, revenue#X] -(66) Scan parquet +(68) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(67) Filter +(69) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(68) BroadcastExchange +(70) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(69) Scan parquet +(71) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(71) Project +(73) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(72) BroadcastHashJoin +(74) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(73) Project +(75) Project Output [2]: [c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(74) Scan parquet +(76) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(75) Filter +(77) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(76) BroadcastExchange +(78) BroadcastExchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(77) BroadcastHashJoin +(79) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(78) Project +(80) Project Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(79) Scan parquet +(81) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(80) Filter +(82) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(81) BroadcastExchange +(83) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(82) BroadcastHashJoin +(84) 
BroadcastHashJoin Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join condition: None -(83) Project +(85) Project Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(84) Scan parquet +(86) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(85) Filter +(87) Filter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(86) BroadcastExchange +(88) BroadcastExchange Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(87) BroadcastHashJoin +(89) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(88) Project +(90) Project Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(89) Scan parquet +(91) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(90) Filter +(92) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(91) Project +(93) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(92) BroadcastExchange +(94) BroadcastExchange Input [1]: [r_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(93) BroadcastHashJoin +(95) BroadcastHashJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(94) Project +(96) Project Output [3]: [l_extendedprice#X, l_discount#X, n_name#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(95) HashAggregate +(97) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, n_name#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(96) Exchange +(98) Exchange Input [3]: [n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) HashAggregate +(99) HashAggregate Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), 
DecimalType(26,4)))#X AS revenue#X] -(98) Exchange +(100) Exchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Sort +(101) Sort Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(100) AdaptiveSparkPlan +(102) AdaptiveSparkPlan Output [2]: [n_name#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/6.txt index 15f7640401849..3432579a0de06 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/6.txt @@ -1,23 +1,23 @@ == Physical Plan == -AdaptiveSparkPlan (19) +AdaptiveSparkPlan (20) +- == Final Plan == - VeloxColumnarToRowExec (12) - +- ^ RegularHashAggregateExecTransformer (10) - +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ ShuffleQueryStage (7), Statistics(X) - +- ColumnarExchange (6) + VeloxColumnarToRowExec (13) + +- ^ RegularHashAggregateExecTransformer (11) + +- ^ InputIteratorTransformer (10) + +- ShuffleQueryStage (8), Statistics(X) + +- ColumnarExchange (7) + +- VeloxAppendBatches (6) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ FilterExecTransformer (2) +- ^ Scan parquet (1) +- == Initial Plan == - HashAggregate (18) - +- Exchange (17) - +- HashAggregate (16) - +- Project (15) - +- Filter (14) - +- Scan parquet (13) + HashAggregate (19) + +- Exchange (18) + +- HashAggregate (17) + +- Project (16) + +- Filter (15) + +- Scan parquet (14) (1) Scan parquet @@ -46,67 +46,71 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(6) ColumnarExchange +(6) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(7) ShuffleQueryStage +(8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(8) InputAdapter +(9) InputAdapter Input [2]: [sum#X, isEmpty#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(10) RegularHashAggregateExecTransformer +(11) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X AS revenue#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(12) VeloxColumnarToRowExec +(13) VeloxColumnarToRowExec Input [1]: [revenue#X] -(13) Scan parquet +(14) Scan parquet Output [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), IsNotNull(l_discount), IsNotNull(l_quantity), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), GreaterThanOrEqual(l_discount,0.05), LessThanOrEqual(l_discount,0.07), LessThan(l_quantity,24.00)] ReadSchema: struct -(14) Filter +(15) Filter Input [4]: [l_quantity#X, l_extendedprice#X, 
l_discount#X, l_shipdate#X] Condition : (((((((isnotnull(l_shipdate#X) AND isnotnull(l_discount#X)) AND isnotnull(l_quantity#X)) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND (l_discount#X >= 0.05)) AND (l_discount#X <= 0.07)) AND (l_quantity#X < 24.00)) -(15) Project +(16) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) HashAggregate +(17) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(17) Exchange +(18) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(18) HashAggregate +(19) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X AS revenue#X] -(19) AdaptiveSparkPlan +(20) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/7.txt index bab148b6bb17c..3db2d7669f149 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/7.txt @@ -1,92 +1,87 @@ == Physical Plan == -AdaptiveSparkPlan (93) +AdaptiveSparkPlan (95) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- ^ SortExecTransformer (58) - +- ^ InputIteratorTransformer (57) - +- ^ InputAdapter (56) - +- ^ ShuffleQueryStage (55), Statistics(X) - +- ColumnarExchange (54) - +- ^ RegularHashAggregateExecTransformer (52) - +- ^ InputIteratorTransformer (51) - +- ^ InputAdapter (50) - +- ^ ShuffleQueryStage (49), Statistics(X) - +- ColumnarExchange (48) + VeloxColumnarToRowExec (62) + +- ^ SortExecTransformer (60) + +- ^ InputIteratorTransformer (59) + +- ShuffleQueryStage (57), Statistics(X) + +- ColumnarExchange (56) + +- VeloxAppendBatches (55) + +- ^ RegularHashAggregateExecTransformer (53) + +- ^ InputIteratorTransformer (52) + +- ShuffleQueryStage (50), Statistics(X) + +- ColumnarExchange (49) + +- VeloxAppendBatches (48) +- ^ ProjectExecTransformer (46) +- ^ FlushableHashAggregateExecTransformer (45) +- ^ ProjectExecTransformer (44) - +- ^ BroadcastHashJoinExecTransformer Inner (43) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (43) :- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (37) : :- ^ ProjectExecTransformer (29) - : : +- ^ BroadcastHashJoinExecTransformer Inner (28) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (28) : : :- ^ ProjectExecTransformer (20) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (19) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (19) : : : :- ^ ProjectExecTransformer (11) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (10) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner 
BuildLeft (10) : : : : :- ^ InputIteratorTransformer (7) - : : : : : +- ^ InputAdapter (6) - : : : : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (4) - : : : : : +- ^ FilterExecTransformer (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (5), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (4) + : : : : : +- ^ FilterExecTransformer (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ FilterExecTransformer (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (18) - : : : +- ^ InputAdapter (17) - : : : +- ^ BroadcastQueryStage (16), Statistics(X) - : : : +- ColumnarBroadcastExchange (15) - : : : +- ^ FilterExecTransformer (13) - : : : +- ^ Scan parquet (12) + : : : +- BroadcastQueryStage (16), Statistics(X) + : : : +- ColumnarBroadcastExchange (15) + : : : +- ^ FilterExecTransformer (13) + : : : +- ^ Scan parquet (12) : : +- ^ InputIteratorTransformer (27) - : : +- ^ InputAdapter (26) - : : +- ^ BroadcastQueryStage (25), Statistics(X) - : : +- ColumnarBroadcastExchange (24) - : : +- ^ FilterExecTransformer (22) - : : +- ^ Scan parquet (21) + : : +- BroadcastQueryStage (25), Statistics(X) + : : +- ColumnarBroadcastExchange (24) + : : +- ^ FilterExecTransformer (22) + : : +- ^ Scan parquet (21) : +- ^ InputIteratorTransformer (36) - : +- ^ InputAdapter (35) - : +- ^ BroadcastQueryStage (34), Statistics(X) - : +- ColumnarBroadcastExchange (33) - : +- ^ FilterExecTransformer (31) - : +- ^ Scan parquet (30) + : +- BroadcastQueryStage (34), Statistics(X) + : +- ColumnarBroadcastExchange (33) + : +- ^ FilterExecTransformer (31) + : +- ^ Scan parquet (30) +- ^ InputIteratorTransformer (42) - +- ^ InputAdapter (41) - +- ^ BroadcastQueryStage (40), Statistics(X) - +- ReusedExchange (39) + +- BroadcastQueryStage (40), Statistics(X) + +- ReusedExchange (39) +- == Initial Plan == - Sort (92) - +- Exchange (91) - +- HashAggregate (90) - +- Exchange (89) - +- HashAggregate (88) - +- Project (87) - +- BroadcastHashJoin Inner BuildRight (86) - :- Project (82) - : +- BroadcastHashJoin Inner BuildRight (81) - : :- Project (77) - : : +- BroadcastHashJoin Inner BuildRight (76) - : : :- Project (72) - : : : +- BroadcastHashJoin Inner BuildRight (71) - : : : :- Project (67) - : : : : +- BroadcastHashJoin Inner BuildLeft (66) - : : : : :- BroadcastExchange (63) - : : : : : +- Filter (62) - : : : : : +- Scan parquet (61) - : : : : +- Filter (65) - : : : : +- Scan parquet (64) - : : : +- BroadcastExchange (70) - : : : +- Filter (69) - : : : +- Scan parquet (68) - : : +- BroadcastExchange (75) - : : +- Filter (74) - : : +- Scan parquet (73) - : +- BroadcastExchange (80) - : +- Filter (79) - : +- Scan parquet (78) - +- BroadcastExchange (85) - +- Filter (84) - +- Scan parquet (83) + Sort (94) + +- Exchange (93) + +- HashAggregate (92) + +- Exchange (91) + +- HashAggregate (90) + +- Project (89) + +- BroadcastHashJoin Inner BuildRight (88) + :- Project (84) + : +- BroadcastHashJoin Inner BuildRight (83) + : :- Project (79) + : : +- BroadcastHashJoin Inner BuildRight (78) + : : :- Project (74) + : : : +- BroadcastHashJoin Inner BuildRight (73) + : : : :- Project (69) + : : : : +- BroadcastHashJoin Inner BuildLeft (68) + : : : : :- BroadcastExchange (65) + : : : : : +- Filter (64) + : : : : : +- Scan parquet (63) + : : : : +- Filter (67) + : : : : +- Scan parquet (66) + : : : +- BroadcastExchange (72) + : : : +- Filter (71) + : : : +- Scan parquet (70) + : : +- BroadcastExchange (77) + : : +- Filter (76) + : : +- Scan parquet (75) + : 
+- BroadcastExchange (82) + : +- Filter (81) + : +- Scan parquet (80) + +- BroadcastExchange (87) + +- Filter (86) + +- Scan parquet (85) (1) Scan parquet @@ -289,213 +284,221 @@ Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: false -(48) ColumnarExchange +(48) VeloxAppendBatches +Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] +Arguments: X + +(49) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(49) ShuffleQueryStage +(50) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: X -(50) InputAdapter +(51) InputAdapter Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(51) InputIteratorTransformer +(52) InputIteratorTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(52) RegularHashAggregateExecTransformer +(53) RegularHashAggregateExecTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(53) WholeStageCodegenTransformer (X) +(54) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(54) ColumnarExchange +(55) VeloxAppendBatches +Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] +Arguments: X + +(56) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(55) ShuffleQueryStage +(57) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: X -(56) InputAdapter +(58) InputAdapter Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(57) InputIteratorTransformer +(59) InputIteratorTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(58) SortExecTransformer +(60) SortExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(59) WholeStageCodegenTransformer (X) +(61) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(60) VeloxColumnarToRowExec +(62) VeloxColumnarToRowExec Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(61) Scan parquet +(63) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(62) Filter +(64) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(63) BroadcastExchange +(65) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(64) Scan parquet +(66) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, 
l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(65) Filter +(67) Filter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(66) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(67) Project +(69) Project Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(68) Scan parquet +(70) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(69) Filter +(71) Filter Input [2]: [o_orderkey#X, o_custkey#X] Condition : (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(70) BroadcastExchange +(72) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(71) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(72) Project +(74) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(73) Scan parquet +(75) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(74) Filter +(76) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(75) BroadcastExchange +(77) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(76) BroadcastHashJoin +(78) BroadcastHashJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(77) Project +(79) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(78) Scan parquet +(80) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(79) Filter +(81) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(80) BroadcastExchange +(82) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(81) BroadcastHashJoin +(83) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(82) Project +(84) Project Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(83) 
Scan parquet +(85) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,GERMANY),EqualTo(n_name,FRANCE))] ReadSchema: struct -(84) Filter +(86) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = GERMANY) OR (n_name#X = FRANCE))) -(85) BroadcastExchange +(87) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(87) Project +(89) Project Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(88) HashAggregate +(90) HashAggregate Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(89) Exchange +(91) Exchange Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) HashAggregate +(92) HashAggregate Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(91) Exchange +(93) Exchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(92) Sort +(94) Sort Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(93) AdaptiveSparkPlan +(95) AdaptiveSparkPlan Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/8.txt index 757b1fcca6b4e..f77b5fca35c8e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/8.txt @@ -1,125 +1,118 @@ == Physical Plan == -AdaptiveSparkPlan (129) +AdaptiveSparkPlan (131) +- == Final Plan == - VeloxColumnarToRowExec (84) - +- ^ SortExecTransformer (82) - +- ^ InputIteratorTransformer (81) - +- ^ InputAdapter (80) - +- ^ ShuffleQueryStage (79), Statistics(X) - +- ColumnarExchange (78) - +- ^ ProjectExecTransformer (76) - +- ^ RegularHashAggregateExecTransformer (75) - +- ^ InputIteratorTransformer (74) - +- ^ InputAdapter (73) - +- ^ ShuffleQueryStage (72), Statistics(X) - +- 
ColumnarExchange (71) + VeloxColumnarToRowExec (86) + +- ^ SortExecTransformer (84) + +- ^ InputIteratorTransformer (83) + +- ShuffleQueryStage (81), Statistics(X) + +- ColumnarExchange (80) + +- VeloxAppendBatches (79) + +- ^ ProjectExecTransformer (77) + +- ^ RegularHashAggregateExecTransformer (76) + +- ^ InputIteratorTransformer (75) + +- ShuffleQueryStage (73), Statistics(X) + +- ColumnarExchange (72) + +- VeloxAppendBatches (71) +- ^ ProjectExecTransformer (69) +- ^ FlushableHashAggregateExecTransformer (68) +- ^ ProjectExecTransformer (67) - +- ^ BroadcastHashJoinExecTransformer Inner (66) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (66) :- ^ ProjectExecTransformer (57) - : +- ^ BroadcastHashJoinExecTransformer Inner (56) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (56) : :- ^ ProjectExecTransformer (48) - : : +- ^ BroadcastHashJoinExecTransformer Inner (47) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) : : :- ^ ProjectExecTransformer (39) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : : : :- ^ ProjectExecTransformer (30) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : : : :- ^ ProjectExecTransformer (21) - : : : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : : : :- ^ ProjectExecTransformer (12) - : : : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : : : :- ^ InputIteratorTransformer (8) - : : : : : : : +- ^ InputAdapter (7) - : : : : : : : +- ^ BroadcastQueryStage (6), Statistics(X) - : : : : : : : +- ColumnarBroadcastExchange (5) - : : : : : : : +- ^ ProjectExecTransformer (3) - : : : : : : : +- ^ FilterExecTransformer (2) - : : : : : : : +- ^ Scan parquet (1) + : : : : : : : +- BroadcastQueryStage (6), Statistics(X) + : : : : : : : +- ColumnarBroadcastExchange (5) + : : : : : : : +- ^ ProjectExecTransformer (3) + : : : : : : : +- ^ FilterExecTransformer (2) + : : : : : : : +- ^ Scan parquet (1) : : : : : : +- ^ FilterExecTransformer (10) : : : : : : +- ^ Scan parquet (9) : : : : : +- ^ InputIteratorTransformer (19) - : : : : : +- ^ InputAdapter (18) - : : : : : +- ^ BroadcastQueryStage (17), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (16) - : : : : : +- ^ FilterExecTransformer (14) - : : : : : +- ^ Scan parquet (13) + : : : : : +- BroadcastQueryStage (17), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (16) + : : : : : +- ^ FilterExecTransformer (14) + : : : : : +- ^ Scan parquet (13) : : : : +- ^ InputIteratorTransformer (28) - : : : : +- ^ InputAdapter (27) - : : : : +- ^ BroadcastQueryStage (26), Statistics(X) - : : : : +- ColumnarBroadcastExchange (25) - : : : : +- ^ FilterExecTransformer (23) - : : : : +- ^ Scan parquet (22) + : : : : +- BroadcastQueryStage (26), Statistics(X) + : : : : +- ColumnarBroadcastExchange (25) + : : : : +- ^ FilterExecTransformer (23) + : : : : +- ^ Scan parquet (22) : : : +- ^ InputIteratorTransformer (37) - : : : +- ^ InputAdapter (36) - : : : +- ^ BroadcastQueryStage (35), Statistics(X) - : : : +- ColumnarBroadcastExchange (34) - : : : +- ^ FilterExecTransformer (32) - : : : +- ^ Scan parquet (31) + : : : +- BroadcastQueryStage (35), Statistics(X) + : : : +- ColumnarBroadcastExchange (34) + : : : +- ^ FilterExecTransformer (32) + : : : +- ^ Scan 
parquet (31) : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ BroadcastQueryStage (44), Statistics(X) - : : +- ColumnarBroadcastExchange (43) - : : +- ^ FilterExecTransformer (41) - : : +- ^ Scan parquet (40) + : : +- BroadcastQueryStage (44), Statistics(X) + : : +- ColumnarBroadcastExchange (43) + : : +- ^ FilterExecTransformer (41) + : : +- ^ Scan parquet (40) : +- ^ InputIteratorTransformer (55) - : +- ^ InputAdapter (54) - : +- ^ BroadcastQueryStage (53), Statistics(X) - : +- ColumnarBroadcastExchange (52) - : +- ^ FilterExecTransformer (50) - : +- ^ Scan parquet (49) + : +- BroadcastQueryStage (53), Statistics(X) + : +- ColumnarBroadcastExchange (52) + : +- ^ FilterExecTransformer (50) + : +- ^ Scan parquet (49) +- ^ InputIteratorTransformer (65) - +- ^ InputAdapter (64) - +- ^ BroadcastQueryStage (63), Statistics(X) - +- ColumnarBroadcastExchange (62) - +- ^ ProjectExecTransformer (60) - +- ^ FilterExecTransformer (59) - +- ^ Scan parquet (58) + +- BroadcastQueryStage (63), Statistics(X) + +- ColumnarBroadcastExchange (62) + +- ^ ProjectExecTransformer (60) + +- ^ FilterExecTransformer (59) + +- ^ Scan parquet (58) +- == Initial Plan == - Sort (128) - +- Exchange (127) - +- HashAggregate (126) - +- Exchange (125) - +- HashAggregate (124) - +- Project (123) - +- BroadcastHashJoin Inner BuildRight (122) - :- Project (117) - : +- BroadcastHashJoin Inner BuildRight (116) - : :- Project (112) - : : +- BroadcastHashJoin Inner BuildRight (111) - : : :- Project (107) - : : : +- BroadcastHashJoin Inner BuildRight (106) - : : : :- Project (102) - : : : : +- BroadcastHashJoin Inner BuildRight (101) - : : : : :- Project (97) - : : : : : +- BroadcastHashJoin Inner BuildRight (96) - : : : : : :- Project (92) - : : : : : : +- BroadcastHashJoin Inner BuildLeft (91) - : : : : : : :- BroadcastExchange (88) - : : : : : : : +- Project (87) - : : : : : : : +- Filter (86) - : : : : : : : +- Scan parquet (85) - : : : : : : +- Filter (90) - : : : : : : +- Scan parquet (89) - : : : : : +- BroadcastExchange (95) - : : : : : +- Filter (94) - : : : : : +- Scan parquet (93) - : : : : +- BroadcastExchange (100) - : : : : +- Filter (99) - : : : : +- Scan parquet (98) - : : : +- BroadcastExchange (105) - : : : +- Filter (104) - : : : +- Scan parquet (103) - : : +- BroadcastExchange (110) - : : +- Filter (109) - : : +- Scan parquet (108) - : +- BroadcastExchange (115) - : +- Filter (114) - : +- Scan parquet (113) - +- BroadcastExchange (121) - +- Project (120) - +- Filter (119) - +- Scan parquet (118) + Sort (130) + +- Exchange (129) + +- HashAggregate (128) + +- Exchange (127) + +- HashAggregate (126) + +- Project (125) + +- BroadcastHashJoin Inner BuildRight (124) + :- Project (119) + : +- BroadcastHashJoin Inner BuildRight (118) + : :- Project (114) + : : +- BroadcastHashJoin Inner BuildRight (113) + : : :- Project (109) + : : : +- BroadcastHashJoin Inner BuildRight (108) + : : : :- Project (104) + : : : : +- BroadcastHashJoin Inner BuildRight (103) + : : : : :- Project (99) + : : : : : +- BroadcastHashJoin Inner BuildRight (98) + : : : : : :- Project (94) + : : : : : : +- BroadcastHashJoin Inner BuildLeft (93) + : : : : : : :- BroadcastExchange (90) + : : : : : : : +- Project (89) + : : : : : : : +- Filter (88) + : : : : : : : +- Scan parquet (87) + : : : : : : +- Filter (92) + : : : : : : +- Scan parquet (91) + : : : : : +- BroadcastExchange (97) + : : : : : +- Filter (96) + : : : : : +- Scan parquet (95) + : : : : +- BroadcastExchange (102) + : : : : +- Filter (101) + : : : : +- 
Scan parquet (100) + : : : +- BroadcastExchange (107) + : : : +- Filter (106) + : : : +- Scan parquet (105) + : : +- BroadcastExchange (112) + : : +- Filter (111) + : : +- Scan parquet (110) + : +- BroadcastExchange (117) + : +- Filter (116) + : +- Scan parquet (115) + +- BroadcastExchange (123) + +- Project (122) + +- Filter (121) + +- Scan parquet (120) (1) Scan parquet @@ -422,273 +415,281 @@ Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(71) ColumnarExchange +(71) VeloxAppendBatches +Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(72) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(72) ShuffleQueryStage +(73) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(73) InputAdapter +(74) InputAdapter Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(74) InputIteratorTransformer +(75) InputIteratorTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(75) RegularHashAggregateExecTransformer +(76) RegularHashAggregateExecTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(76) ProjectExecTransformer +(77) ProjectExecTransformer Output [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6)) AS mkt_share#X] Input [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(77) WholeStageCodegenTransformer (X) +(78) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(78) ColumnarExchange +(79) VeloxAppendBatches +Input [2]: [o_year#X, mkt_share#X] +Arguments: X + +(80) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(79) ShuffleQueryStage +(81) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] Arguments: X -(80) InputAdapter +(82) InputAdapter Input [2]: [o_year#X, mkt_share#X] -(81) InputIteratorTransformer +(83) InputIteratorTransformer Input [2]: [o_year#X, mkt_share#X] -(82) SortExecTransformer +(84) SortExecTransformer Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(83) WholeStageCodegenTransformer (X) +(85) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(84) VeloxColumnarToRowExec +(86) VeloxColumnarToRowExec Input [2]: [o_year#X, mkt_share#X] -(85) Scan parquet +(87) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_type), EqualTo(p_type,ECONOMY ANODIZED STEEL), IsNotNull(p_partkey)] ReadSchema: struct -(86) Filter +(88) Filter Input [2]: [p_partkey#X, p_type#X] Condition : ((isnotnull(p_type#X) AND (p_type#X = ECONOMY ANODIZED STEEL)) AND isnotnull(p_partkey#X)) -(87) Project +(89) Project 
Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_type#X] -(88) BroadcastExchange +(90) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(89) Scan parquet +(91) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(90) Filter +(92) Filter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(91) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(92) Project +(94) Project Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(93) Scan parquet +(95) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(94) Filter +(96) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(95) BroadcastExchange +(97) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(96) BroadcastHashJoin +(98) BroadcastHashJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(97) Project +(99) Project Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(98) Scan parquet +(100) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(99) Filter +(101) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(100) BroadcastExchange +(102) BroadcastExchange Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(101) BroadcastHashJoin +(103) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(102) Project +(104) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(103) Scan parquet +(105) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(104) Filter +(106) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(105) BroadcastExchange +(107) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(106) 
BroadcastHashJoin +(108) BroadcastHashJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(107) Project +(109) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(108) Scan parquet +(110) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(109) Filter +(111) Filter Input [2]: [n_nationkey#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(110) BroadcastExchange +(112) BroadcastExchange Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(111) BroadcastHashJoin +(113) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(112) Project +(114) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(113) Scan parquet +(115) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(114) Filter +(116) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(115) BroadcastExchange +(117) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(116) BroadcastHashJoin +(118) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(117) Project +(119) Project Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(118) Scan parquet +(120) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(119) Filter +(121) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(120) Project +(122) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(121) BroadcastExchange +(123) BroadcastExchange Input [1]: [r_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(122) BroadcastHashJoin +(124) BroadcastHashJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(123) Project +(125) Project Output [3]: [year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS volume#X, n_name#X AS nation#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(124) HashAggregate +(126) HashAggregate Input [3]: [o_year#X, volume#X, nation#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, 
isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(125) Exchange +(127) Exchange Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(126) HashAggregate +(128) HashAggregate Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6)) AS mkt_share#X] -(127) Exchange +(129) Exchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(128) Sort +(130) Sort Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(129) AdaptiveSparkPlan +(131) AdaptiveSparkPlan Output [2]: [o_year#X, mkt_share#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/9.txt index 4b0cb7140dcde..1be487ed123ae 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/9.txt @@ -1,96 +1,91 @@ == Physical Plan == -AdaptiveSparkPlan (98) +AdaptiveSparkPlan (100) +- == Final Plan == - VeloxColumnarToRowExec (64) - +- ^ SortExecTransformer (62) - +- ^ InputIteratorTransformer (61) - +- ^ InputAdapter (60) - +- ^ ShuffleQueryStage (59), Statistics(X) - +- ColumnarExchange (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) + VeloxColumnarToRowExec (66) + +- ^ SortExecTransformer (64) + +- ^ InputIteratorTransformer (63) + +- ShuffleQueryStage (61), Statistics(X) + +- ColumnarExchange (60) + +- VeloxAppendBatches (59) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ InputIteratorTransformer (56) + +- ShuffleQueryStage (54), Statistics(X) + +- ColumnarExchange (53) + +- VeloxAppendBatches (52) +- ^ ProjectExecTransformer (50) +- ^ FlushableHashAggregateExecTransformer (49) +- ^ ProjectExecTransformer (48) - +- ^ BroadcastHashJoinExecTransformer Inner (47) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) :- ^ ProjectExecTransformer (39) - : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : :- ^ ProjectExecTransformer (30) - : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : :- ^ ProjectExecTransformer (21) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : :- ^ ProjectExecTransformer (12) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ BroadcastQueryStage (6), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (5) - : : : : : +- ^ ProjectExecTransformer (3) - : : : : : +- ^ FilterExecTransformer (2) - : : 
: : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (6), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (5) + : : : : : +- ^ ProjectExecTransformer (3) + : : : : : +- ^ FilterExecTransformer (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ FilterExecTransformer (10) : : : : +- ^ Scan parquet (9) : : : +- ^ InputIteratorTransformer (19) - : : : +- ^ InputAdapter (18) - : : : +- ^ BroadcastQueryStage (17), Statistics(X) - : : : +- ColumnarBroadcastExchange (16) - : : : +- ^ FilterExecTransformer (14) - : : : +- ^ Scan parquet (13) + : : : +- BroadcastQueryStage (17), Statistics(X) + : : : +- ColumnarBroadcastExchange (16) + : : : +- ^ FilterExecTransformer (14) + : : : +- ^ Scan parquet (13) : : +- ^ InputIteratorTransformer (28) - : : +- ^ InputAdapter (27) - : : +- ^ BroadcastQueryStage (26), Statistics(X) - : : +- ColumnarBroadcastExchange (25) - : : +- ^ FilterExecTransformer (23) - : : +- ^ Scan parquet (22) + : : +- BroadcastQueryStage (26), Statistics(X) + : : +- ColumnarBroadcastExchange (25) + : : +- ^ FilterExecTransformer (23) + : : +- ^ Scan parquet (22) : +- ^ InputIteratorTransformer (37) - : +- ^ InputAdapter (36) - : +- ^ BroadcastQueryStage (35), Statistics(X) - : +- ColumnarBroadcastExchange (34) - : +- ^ FilterExecTransformer (32) - : +- ^ Scan parquet (31) + : +- BroadcastQueryStage (35), Statistics(X) + : +- ColumnarBroadcastExchange (34) + : +- ^ FilterExecTransformer (32) + : +- ^ Scan parquet (31) +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ BroadcastQueryStage (44), Statistics(X) - +- ColumnarBroadcastExchange (43) - +- ^ FilterExecTransformer (41) - +- ^ Scan parquet (40) + +- BroadcastQueryStage (44), Statistics(X) + +- ColumnarBroadcastExchange (43) + +- ^ FilterExecTransformer (41) + +- ^ Scan parquet (40) +- == Initial Plan == - Sort (97) - +- Exchange (96) - +- HashAggregate (95) - +- Exchange (94) - +- HashAggregate (93) - +- Project (92) - +- BroadcastHashJoin Inner BuildRight (91) - :- Project (87) - : +- BroadcastHashJoin Inner BuildRight (86) - : :- Project (82) - : : +- BroadcastHashJoin Inner BuildRight (81) - : : :- Project (77) - : : : +- BroadcastHashJoin Inner BuildRight (76) - : : : :- Project (72) - : : : : +- BroadcastHashJoin Inner BuildLeft (71) - : : : : :- BroadcastExchange (68) - : : : : : +- Project (67) - : : : : : +- Filter (66) - : : : : : +- Scan parquet (65) - : : : : +- Filter (70) - : : : : +- Scan parquet (69) - : : : +- BroadcastExchange (75) - : : : +- Filter (74) - : : : +- Scan parquet (73) - : : +- BroadcastExchange (80) - : : +- Filter (79) - : : +- Scan parquet (78) - : +- BroadcastExchange (85) - : +- Filter (84) - : +- Scan parquet (83) - +- BroadcastExchange (90) - +- Filter (89) - +- Scan parquet (88) + Sort (99) + +- Exchange (98) + +- HashAggregate (97) + +- Exchange (96) + +- HashAggregate (95) + +- Project (94) + +- BroadcastHashJoin Inner BuildRight (93) + :- Project (89) + : +- BroadcastHashJoin Inner BuildRight (88) + : :- Project (84) + : : +- BroadcastHashJoin Inner BuildRight (83) + : : :- Project (79) + : : : +- BroadcastHashJoin Inner BuildRight (78) + : : : :- Project (74) + : : : : +- BroadcastHashJoin Inner BuildLeft (73) + : : : : :- BroadcastExchange (70) + : : : : : +- Project (69) + : : : : : +- Filter (68) + : : : : : +- Scan parquet (67) + : : : : +- Filter (72) + : : : : +- Scan parquet (71) + : : : +- BroadcastExchange (77) + : : : +- Filter (76) + : : : +- Scan parquet (75) + : : +- BroadcastExchange (82) + : : +- Filter (81) + : : +- Scan parquet (80) + : 
+- BroadcastExchange (87) + : +- Filter (86) + : +- Scan parquet (85) + +- BroadcastExchange (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet @@ -313,217 +308,225 @@ Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: false -(52) ColumnarExchange +(52) VeloxAppendBatches +Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] +Arguments: X + +(53) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(54) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: X -(54) InputAdapter +(55) InputAdapter Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(55) InputIteratorTransformer +(56) InputIteratorTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(56) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(57) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(58) ColumnarExchange +(59) VeloxAppendBatches +Input [3]: [nation#X, o_year#X, sum_profit#X] +Arguments: X + +(60) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(61) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: X -(60) InputAdapter +(62) InputAdapter Input [3]: [nation#X, o_year#X, sum_profit#X] -(61) InputIteratorTransformer +(63) InputIteratorTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] -(62) SortExecTransformer +(64) SortExecTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(63) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(64) VeloxColumnarToRowExec +(66) VeloxColumnarToRowExec Input [3]: [nation#X, o_year#X, sum_profit#X] -(65) Scan parquet +(67) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringContains(p_name,green), IsNotNull(p_partkey)] ReadSchema: struct -(66) Filter +(68) Filter Input [2]: [p_partkey#X, p_name#X] Condition : ((isnotnull(p_name#X) AND Contains(p_name#X, green)) AND isnotnull(p_partkey#X)) -(67) Project +(69) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(68) BroadcastExchange +(70) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(69) Scan parquet +(71) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND 
isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(71) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(72) Project +(74) Project Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(73) Scan parquet +(75) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(74) Filter +(76) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(75) BroadcastExchange +(77) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(76) BroadcastHashJoin +(78) BroadcastHashJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(77) Project +(79) Project Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(78) Scan parquet +(80) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(79) Filter +(81) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Condition : (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(80) BroadcastExchange +(82) BroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, false], input[0, bigint, false]),false), [plan_id=X] -(81) BroadcastHashJoin +(83) BroadcastHashJoin Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join condition: None -(82) Project +(84) Project Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(83) Scan parquet +(85) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(84) Filter +(86) Filter Input [2]: [o_orderkey#X, o_orderdate#X] Condition : isnotnull(o_orderkey#X) -(85) BroadcastExchange +(87) BroadcastExchange Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(87) Project +(89) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(88) Scan parquet +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(89) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(90) BroadcastExchange +(92) 
BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(91) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(92) Project +(94) Project Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) as decimal(27,4))) - promote_precision(cast(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(l_quantity#X)), DecimalType(25,4)) as decimal(27,4)))), DecimalType(27,4)) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(93) HashAggregate +(95) HashAggregate Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(94) Exchange +(96) Exchange Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(95) HashAggregate +(97) HashAggregate Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(96) Exchange +(98) Exchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) Sort +(99) Sort Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(98) AdaptiveSparkPlan +(100) AdaptiveSparkPlan Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/1.txt index 634f26c86f24a..1e53cd90e1b35 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/1.txt @@ -1,31 +1,31 @@ == Physical Plan == -AdaptiveSparkPlan (28) +AdaptiveSparkPlan (30) +- == Final Plan == - VeloxColumnarToRowExec (19) - +- ^ SortExecTransformer (17) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8), Statistics(X) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (21) + +- ^ SortExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9), Statistics(X) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ ProjectExecTransformer (5) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ FilterExecTransformer (2) +- ^ Scan parquet (1) +- == Initial Plan == - Sort (27) - +- 
Exchange (26) - +- HashAggregate (25) - +- Exchange (24) - +- HashAggregate (23) - +- Project (22) - +- Filter (21) - +- Scan parquet (20) + Sort (29) + +- Exchange (28) + +- HashAggregate (27) + +- Exchange (26) + +- HashAggregate (25) + +- Project (24) + +- Filter (23) + +- Scan parquet (22) (1) Scan parquet @@ -58,97 +58,105 @@ Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] +Arguments: X + +(8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum((l_extendedprice#X * (1 - l_discount#X))), sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS sum_disc_price#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(13) 
ColumnarExchange +(14) VeloxAppendBatches +Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] +Arguments: X + +(15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(17) SortExecTransformer +(19) SortExecTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(18) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(19) VeloxColumnarToRowExec +(21) VeloxColumnarToRowExec Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(20) Scan parquet +(22) Scan parquet Output [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), LessThanOrEqual(l_shipdate,1998-09-02)] ReadSchema: struct -(21) Filter +(23) Filter Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Condition : (isnotnull(l_shipdate#X) AND (l_shipdate#X <= 1998-09-02)) -(22) Project +(24) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] -(23) HashAggregate +(25) HashAggregate Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [partial_sum(l_quantity#X), partial_sum(l_extendedprice#X), partial_sum((l_extendedprice#X * (1 - l_discount#X))), partial_sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X))), partial_avg(l_quantity#X), partial_avg(l_extendedprice#X), partial_avg(l_discount#X), partial_count(1)] Aggregate Attributes [15]: [sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Results [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(24) Exchange +(26) Exchange Input [17]: 
[l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(25) HashAggregate +(27) HashAggregate Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum((l_extendedprice#X * (1 - l_discount#X))), sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS sum_disc_price#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(26) Exchange +(28) Exchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Sort +(29) Sort Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(28) AdaptiveSparkPlan +(30) AdaptiveSparkPlan Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/10.txt index a0a2ead10a377..098b6610a2e12 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/10.txt @@ -1,68 +1,65 @@ == Physical Plan == -AdaptiveSparkPlan (67) +AdaptiveSparkPlan (68) +- == Final Plan == - VeloxColumnarToRowExec (43) - +- TakeOrderedAndProjectExecTransformer (42) - +- ^ ProjectExecTransformer (40) - +- ^ RegularHashAggregateExecTransformer (39) - +- ^ InputIteratorTransformer (38) - +- ^ InputAdapter (37) - +- ^ ShuffleQueryStage (36), Statistics(X) - +- ColumnarExchange (35) + VeloxColumnarToRowExec (44) + +- TakeOrderedAndProjectExecTransformer (43) + +- ^ ProjectExecTransformer (41) + +- ^ RegularHashAggregateExecTransformer (40) + +- ^ InputIteratorTransformer (39) + +- ShuffleQueryStage (37), Statistics(X) + +- ColumnarExchange (36) + +- VeloxAppendBatches (35) +- ^ ProjectExecTransformer (33) +- ^ FlushableHashAggregateExecTransformer (32) +- ^ ProjectExecTransformer (31) - +- ^ BroadcastHashJoinExecTransformer Inner (30) + +- ^ 
BroadcastHashJoinExecTransformer Inner BuildRight (30) :- ^ ProjectExecTransformer (22) - : +- ^ BroadcastHashJoinExecTransformer Inner (21) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (21) : :- ^ ProjectExecTransformer (12) - : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) : : :- ^ FilterExecTransformer (2) : : : +- ^ Scan parquet (1) : : +- ^ InputIteratorTransformer (10) - : : +- ^ InputAdapter (9) - : : +- ^ BroadcastQueryStage (8), Statistics(X) - : : +- ColumnarBroadcastExchange (7) - : : +- ^ ProjectExecTransformer (5) - : : +- ^ FilterExecTransformer (4) - : : +- ^ Scan parquet (3) + : : +- BroadcastQueryStage (8), Statistics(X) + : : +- ColumnarBroadcastExchange (7) + : : +- ^ ProjectExecTransformer (5) + : : +- ^ FilterExecTransformer (4) + : : +- ^ Scan parquet (3) : +- ^ InputIteratorTransformer (20) - : +- ^ InputAdapter (19) - : +- ^ BroadcastQueryStage (18), Statistics(X) - : +- ColumnarBroadcastExchange (17) - : +- ^ ProjectExecTransformer (15) - : +- ^ FilterExecTransformer (14) - : +- ^ Scan parquet (13) + : +- BroadcastQueryStage (18), Statistics(X) + : +- ColumnarBroadcastExchange (17) + : +- ^ ProjectExecTransformer (15) + : +- ^ FilterExecTransformer (14) + : +- ^ Scan parquet (13) +- ^ InputIteratorTransformer (29) - +- ^ InputAdapter (28) - +- ^ BroadcastQueryStage (27), Statistics(X) - +- ColumnarBroadcastExchange (26) - +- ^ FilterExecTransformer (24) - +- ^ Scan parquet (23) + +- BroadcastQueryStage (27), Statistics(X) + +- ColumnarBroadcastExchange (26) + +- ^ FilterExecTransformer (24) + +- ^ Scan parquet (23) +- == Initial Plan == - TakeOrderedAndProject (66) - +- HashAggregate (65) - +- Exchange (64) - +- HashAggregate (63) - +- Project (62) - +- BroadcastHashJoin Inner BuildRight (61) - :- Project (57) - : +- BroadcastHashJoin Inner BuildRight (56) - : :- Project (51) - : : +- BroadcastHashJoin Inner BuildRight (50) - : : :- Filter (45) - : : : +- Scan parquet (44) - : : +- BroadcastExchange (49) - : : +- Project (48) - : : +- Filter (47) - : : +- Scan parquet (46) - : +- BroadcastExchange (55) - : +- Project (54) - : +- Filter (53) - : +- Scan parquet (52) - +- BroadcastExchange (60) - +- Filter (59) - +- Scan parquet (58) + TakeOrderedAndProject (67) + +- HashAggregate (66) + +- Exchange (65) + +- HashAggregate (64) + +- Project (63) + +- BroadcastHashJoin Inner BuildRight (62) + :- Project (58) + : +- BroadcastHashJoin Inner BuildRight (57) + : :- Project (52) + : : +- BroadcastHashJoin Inner BuildRight (51) + : : :- Filter (46) + : : : +- Scan parquet (45) + : : +- BroadcastExchange (50) + : : +- Project (49) + : : +- Filter (48) + : : +- Scan parquet (47) + : +- BroadcastExchange (56) + : +- Project (55) + : +- Filter (54) + : +- Scan parquet (53) + +- BroadcastExchange (61) + +- Filter (60) + +- Scan parquet (59) (1) Scan parquet @@ -216,158 +213,162 @@ Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: false -(35) ColumnarExchange +(35) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] +Arguments: X + +(36) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: 
hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(37) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: X -(37) InputAdapter +(38) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(38) InputIteratorTransformer +(39) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(39) RegularHashAggregateExecTransformer +(40) RegularHashAggregateExecTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(40) ProjectExecTransformer +(41) ProjectExecTransformer Output [8]: [c_custkey#X, c_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Input [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(41) WholeStageCodegenTransformer (X) +(42) WholeStageCodegenTransformer (X) Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: false -(42) TakeOrderedAndProjectExecTransformer +(43) TakeOrderedAndProjectExecTransformer Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X], 0 -(43) VeloxColumnarToRowExec +(44) VeloxColumnarToRowExec Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(44) Scan parquet +(45) Scan parquet Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(45) Filter +(46) Filter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(46) Scan parquet +(47) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(47) Filter +(48) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(48) Project +(49) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(49) 
BroadcastExchange +(50) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(50) BroadcastHashJoin +(51) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(51) Project +(52) Project Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(52) Scan parquet +(53) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(53) Filter +(54) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Condition : ((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(54) Project +(55) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(55) BroadcastExchange +(56) BroadcastExchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(56) BroadcastHashJoin +(57) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(57) Project +(58) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(58) Scan parquet +(59) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(59) Filter +(60) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(60) BroadcastExchange +(61) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(61) BroadcastHashJoin +(62) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(62) Project +(63) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(63) HashAggregate +(64) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(64) Exchange +(65) Exchange Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), 
ENSURE_REQUIREMENTS, [plan_id=X] -(65) HashAggregate +(66) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [8]: [c_custkey#X, c_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(66) TakeOrderedAndProject +(67) TakeOrderedAndProject Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(67) AdaptiveSparkPlan +(68) AdaptiveSparkPlan Output [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/11.txt index c81d951f6e311..725debe00d5e3 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/11.txt @@ -1,59 +1,57 @@ == Physical Plan == -AdaptiveSparkPlan (58) +AdaptiveSparkPlan (60) +- == Final Plan == - VeloxColumnarToRowExec (38) - +- ^ SortExecTransformer (36) - +- ^ InputIteratorTransformer (35) - +- ^ InputAdapter (34) - +- ^ ShuffleQueryStage (33), Statistics(X) - +- ColumnarExchange (32) - +- ^ FilterExecTransformer (30) - +- ^ RegularHashAggregateExecTransformer (29) - +- ^ InputIteratorTransformer (28) - +- ^ InputAdapter (27) - +- ^ ShuffleQueryStage (26), Statistics(X) - +- ColumnarExchange (25) + VeloxColumnarToRowExec (40) + +- ^ SortExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35), Statistics(X) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ FilterExecTransformer (31) + +- ^ RegularHashAggregateExecTransformer (30) + +- ^ InputIteratorTransformer (29) + +- ShuffleQueryStage (27), Statistics(X) + +- ColumnarExchange (26) + +- VeloxAppendBatches (25) +- ^ ProjectExecTransformer (23) +- ^ FlushableHashAggregateExecTransformer (22) +- ^ ProjectExecTransformer (21) - +- ^ BroadcastHashJoinExecTransformer Inner (20) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) :- ^ ProjectExecTransformer (11) - : +- ^ BroadcastHashJoinExecTransformer Inner (10) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (10) : :- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) : +- ^ InputIteratorTransformer (9) - : +- ^ InputAdapter (8) - : +- ^ BroadcastQueryStage (7), Statistics(X) - : +- ColumnarBroadcastExchange (6) - : +- ^ FilterExecTransformer (4) - : +- ^ Scan parquet (3) + : +- BroadcastQueryStage (7), Statistics(X) + : +- ColumnarBroadcastExchange (6) + : +- ^ FilterExecTransformer (4) + : +- ^ Scan parquet (3) +- ^ InputIteratorTransformer (19) - +- ^ InputAdapter (18) - +- ^ BroadcastQueryStage (17), Statistics(X) - +- ColumnarBroadcastExchange (16) - +- ^ ProjectExecTransformer (14) - +- ^ FilterExecTransformer (13) - +- ^ Scan parquet (12) + +- BroadcastQueryStage (17), Statistics(X) + +- ColumnarBroadcastExchange (16) + +- ^ ProjectExecTransformer (14) + +- ^ 
FilterExecTransformer (13) + +- ^ Scan parquet (12) +- == Initial Plan == - Sort (57) - +- Exchange (56) - +- Filter (55) - +- HashAggregate (54) - +- Exchange (53) - +- HashAggregate (52) - +- Project (51) - +- BroadcastHashJoin Inner BuildRight (50) - :- Project (45) - : +- BroadcastHashJoin Inner BuildRight (44) - : :- Filter (40) - : : +- Scan parquet (39) - : +- BroadcastExchange (43) - : +- Filter (42) - : +- Scan parquet (41) - +- BroadcastExchange (49) - +- Project (48) - +- Filter (47) - +- Scan parquet (46) + Sort (59) + +- Exchange (58) + +- Filter (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- Project (53) + +- BroadcastHashJoin Inner BuildRight (52) + :- Project (47) + : +- BroadcastHashJoin Inner BuildRight (46) + : :- Filter (42) + : : +- Scan parquet (41) + : +- BroadcastExchange (45) + : +- Filter (44) + : +- Scan parquet (43) + +- BroadcastExchange (51) + +- Project (50) + +- Filter (49) + +- Scan parquet (48) (1) Scan parquet @@ -164,388 +162,398 @@ Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: false -(25) ColumnarExchange +(25) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] +Arguments: X + +(26) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(26) ShuffleQueryStage +(27) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: X -(27) InputAdapter +(28) InputAdapter Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(28) InputIteratorTransformer +(29) InputIteratorTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(29) RegularHashAggregateExecTransformer +(30) RegularHashAggregateExecTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [2]: [ps_partkey#X, sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X AS value#X] -(30) FilterExecTransformer +(31) FilterExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(31) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(32) ColumnarExchange +(33) VeloxAppendBatches +Input [2]: [ps_partkey#X, value#X] +Arguments: X + +(34) ColumnarExchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(33) ShuffleQueryStage +(35) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] Arguments: X -(34) InputAdapter +(36) InputAdapter Input [2]: [ps_partkey#X, value#X] -(35) InputIteratorTransformer +(37) InputIteratorTransformer Input [2]: [ps_partkey#X, value#X] -(36) SortExecTransformer +(38) SortExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(37) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(38) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [2]: [ps_partkey#X, value#X] -(39) Scan parquet +(41) Scan parquet Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: 
InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(40) Filter +(42) Filter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(41) Scan parquet +(43) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(42) Filter +(44) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(43) BroadcastExchange +(45) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(44) BroadcastHashJoin +(46) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(45) Project +(47) Project Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(46) Scan parquet +(48) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(47) Filter +(49) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(48) Project +(50) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(49) BroadcastExchange +(51) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(50) BroadcastHashJoin +(52) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(51) Project +(53) Project Output [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(52) HashAggregate +(54) HashAggregate Input [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(53) Exchange +(55) Exchange Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) HashAggregate +(56) HashAggregate Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [2]: [ps_partkey#X, sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X AS value#X] -(55) Filter +(57) Filter Input [2]: [ps_partkey#X, value#X] Condition : (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(56) Exchange +(58) Exchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(57) Sort +(59) Sort Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(58) AdaptiveSparkPlan +(60) AdaptiveSparkPlan Output [2]: [ps_partkey#X, value#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 30 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (99) +Subquery:1 
Hosting operator id = 31 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (102) +- == Final Plan == - VeloxColumnarToRowExec (82) - +- ^ ProjectExecTransformer (80) - +- ^ RegularHashAggregateExecTransformer (79) - +- ^ InputIteratorTransformer (78) - +- ^ InputAdapter (77) - +- ^ ShuffleQueryStage (76), Statistics(X) - +- ColumnarExchange (75) - +- ^ FlushableHashAggregateExecTransformer (73) - +- ^ ProjectExecTransformer (72) - +- ^ BroadcastHashJoinExecTransformer Inner (71) - :- ^ ProjectExecTransformer (66) - : +- ^ BroadcastHashJoinExecTransformer Inner (65) - : :- ^ FilterExecTransformer (60) - : : +- ^ Scan parquet (59) - : +- ^ InputIteratorTransformer (64) - : +- ^ InputAdapter (63) - : +- ^ BroadcastQueryStage (62), Statistics(X) - : +- ReusedExchange (61) - +- ^ InputIteratorTransformer (70) - +- ^ InputAdapter (69) - +- ^ BroadcastQueryStage (68), Statistics(X) - +- ReusedExchange (67) + VeloxColumnarToRowExec (85) + +- ^ ProjectExecTransformer (83) + +- ^ RegularHashAggregateExecTransformer (82) + +- ^ InputIteratorTransformer (81) + +- ShuffleQueryStage (79), Statistics(X) + +- ColumnarExchange (78) + +- VeloxAppendBatches (77) + +- ^ FlushableHashAggregateExecTransformer (75) + +- ^ ProjectExecTransformer (74) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (73) + :- ^ ProjectExecTransformer (68) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (67) + : :- ^ FilterExecTransformer (62) + : : +- ^ Scan parquet (61) + : +- ^ InputIteratorTransformer (66) + : +- BroadcastQueryStage (64), Statistics(X) + : +- ReusedExchange (63) + +- ^ InputIteratorTransformer (72) + +- BroadcastQueryStage (70), Statistics(X) + +- ReusedExchange (69) +- == Initial Plan == - HashAggregate (98) - +- Exchange (97) - +- HashAggregate (96) - +- Project (95) - +- BroadcastHashJoin Inner BuildRight (94) - :- Project (89) - : +- BroadcastHashJoin Inner BuildRight (88) - : :- Filter (84) - : : +- Scan parquet (83) - : +- BroadcastExchange (87) - : +- Filter (86) - : +- Scan parquet (85) - +- BroadcastExchange (93) - +- Project (92) - +- Filter (91) - +- Scan parquet (90) - - -(59) Scan parquet + HashAggregate (101) + +- Exchange (100) + +- HashAggregate (99) + +- Project (98) + +- BroadcastHashJoin Inner BuildRight (97) + :- Project (92) + : +- BroadcastHashJoin Inner BuildRight (91) + : :- Filter (87) + : : +- Scan parquet (86) + : +- BroadcastExchange (90) + : +- Filter (89) + : +- Scan parquet (88) + +- BroadcastExchange (96) + +- Project (95) + +- Filter (94) + +- Scan parquet (93) + + +(61) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(60) FilterExecTransformer +(62) FilterExecTransformer Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: isnotnull(ps_suppkey#X) -(61) ReusedExchange [Reuses operator id: 6] +(63) ReusedExchange [Reuses operator id: 6] Output [2]: [s_suppkey#X, s_nationkey#X] -(62) BroadcastQueryStage +(64) BroadcastQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(63) InputAdapter +(65) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(64) InputIteratorTransformer +(66) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(65) BroadcastHashJoinExecTransformer +(67) BroadcastHashJoinExecTransformer Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(66) ProjectExecTransformer +(68) ProjectExecTransformer Output [3]: 
[ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(67) ReusedExchange [Reuses operator id: 16] +(69) ReusedExchange [Reuses operator id: 16] Output [1]: [n_nationkey#X] -(68) BroadcastQueryStage +(70) BroadcastQueryStage Output [1]: [n_nationkey#X] Arguments: X -(69) InputAdapter +(71) InputAdapter Input [1]: [n_nationkey#X] -(70) InputIteratorTransformer +(72) InputIteratorTransformer Input [1]: [n_nationkey#X] -(71) BroadcastHashJoinExecTransformer +(73) BroadcastHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(72) ProjectExecTransformer +(74) ProjectExecTransformer Output [3]: [ps_availqty#X, ps_supplycost#X, (ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))) AS _pre_X#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(73) FlushableHashAggregateExecTransformer +(75) FlushableHashAggregateExecTransformer Input [3]: [ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(74) WholeStageCodegenTransformer (X) +(76) WholeStageCodegenTransformer (X) Input [2]: [sum#X, isEmpty#X] Arguments: false -(75) ColumnarExchange +(77) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(78) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(76) ShuffleQueryStage +(79) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(77) InputAdapter +(80) InputAdapter Input [2]: [sum#X, isEmpty#X] -(78) InputIteratorTransformer +(81) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(79) RegularHashAggregateExecTransformer +(82) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] -(80) ProjectExecTransformer +(83) ProjectExecTransformer Output [1]: [(sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X * 0.0001000000) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Input [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] -(81) WholeStageCodegenTransformer (X) +(84) WholeStageCodegenTransformer (X) Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: false -(82) VeloxColumnarToRowExec +(85) VeloxColumnarToRowExec Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(83) Scan parquet +(86) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(84) Filter +(87) Filter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(85) Scan parquet +(88) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(86) Filter +(89) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(87) BroadcastExchange +(90) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, 
false]),false), [plan_id=X] -(88) BroadcastHashJoin +(91) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(89) Project +(92) Project Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(90) Scan parquet +(93) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(91) Filter +(94) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(92) Project +(95) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(93) BroadcastExchange +(96) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(94) BroadcastHashJoin +(97) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(95) Project +(98) Project Output [2]: [ps_availqty#X, ps_supplycost#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(96) HashAggregate +(99) HashAggregate Input [2]: [ps_availqty#X, ps_supplycost#X] Keys: [] Functions [1]: [partial_sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(97) Exchange +(100) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(98) HashAggregate +(101) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [1]: [(sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X * 0.0001000000) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(99) AdaptiveSparkPlan +(102) AdaptiveSparkPlan Output [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/12.txt index 36ebd2bf5e76d..e2b9e5153009f 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/12.txt @@ -1,44 +1,43 @@ == Physical Plan == -AdaptiveSparkPlan (42) +AdaptiveSparkPlan (44) +- == Final Plan == - VeloxColumnarToRowExec (28) - +- ^ SortExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ InputIteratorTransformer (19) - +- ^ InputAdapter (18) - +- ^ ShuffleQueryStage (17), Statistics(X) - +- ColumnarExchange (16) + VeloxColumnarToRowExec (30) + +- ^ SortExecTransformer (28) + +- ^ InputIteratorTransformer (27) + +- ShuffleQueryStage (25), Statistics(X) + +- ColumnarExchange (24) + +- VeloxAppendBatches (23) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ InputIteratorTransformer (20) + +- ShuffleQueryStage (18), Statistics(X) + +- ColumnarExchange (17) + +- VeloxAppendBatches (16) +- ^ ProjectExecTransformer (14) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ 
ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ BroadcastQueryStage (5), Statistics(X) - : +- ColumnarBroadcastExchange (4) - : +- ^ FilterExecTransformer (2) - : +- ^ Scan parquet (1) + : +- BroadcastQueryStage (5), Statistics(X) + : +- ColumnarBroadcastExchange (4) + : +- ^ FilterExecTransformer (2) + : +- ^ Scan parquet (1) +- ^ ProjectExecTransformer (10) +- ^ FilterExecTransformer (9) +- ^ Scan parquet (8) +- == Initial Plan == - Sort (41) - +- Exchange (40) - +- HashAggregate (39) - +- Exchange (38) - +- HashAggregate (37) - +- Project (36) - +- BroadcastHashJoin Inner BuildLeft (35) - :- BroadcastExchange (31) - : +- Filter (30) - : +- Scan parquet (29) - +- Project (34) - +- Filter (33) - +- Scan parquet (32) + Sort (43) + +- Exchange (42) + +- HashAggregate (41) + +- Exchange (40) + +- HashAggregate (39) + +- Project (38) + +- BroadcastHashJoin Inner BuildLeft (37) + :- BroadcastExchange (33) + : +- Filter (32) + : +- Scan parquet (31) + +- Project (36) + +- Filter (35) + +- Scan parquet (34) (1) Scan parquet @@ -110,122 +109,130 @@ Input [3]: [l_shipmode#X, sum#X, sum#X] Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: false -(16) ColumnarExchange +(16) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] +Arguments: X + +(17) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] -(17) ShuffleQueryStage +(18) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] Arguments: X -(18) InputAdapter +(19) InputAdapter Input [3]: [l_shipmode#X, sum#X, sum#X] -(19) InputIteratorTransformer +(20) InputIteratorTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] -(20) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(22) ColumnarExchange +(23) VeloxAppendBatches +Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] +Arguments: X + +(24) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(25) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: X -(24) InputAdapter +(26) InputAdapter Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(25) 
InputIteratorTransformer +(27) InputIteratorTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(26) SortExecTransformer +(28) SortExecTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(27) WholeStageCodegenTransformer (X) +(29) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(28) VeloxColumnarToRowExec +(30) VeloxColumnarToRowExec Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(29) Scan parquet +(31) Scan parquet Output [2]: [o_orderkey#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(30) Filter +(32) Filter Input [2]: [o_orderkey#X, o_orderpriority#X] Condition : isnotnull(o_orderkey#X) -(31) BroadcastExchange +(33) BroadcastExchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(32) Scan parquet +(34) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(33) Filter +(35) Filter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Condition : ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(34) Project +(36) Project Output [2]: [l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(35) BroadcastHashJoin +(37) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(36) Project +(38) Project Output [2]: [o_orderpriority#X, l_shipmode#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(37) HashAggregate +(39) HashAggregate Input [2]: [o_orderpriority#X, l_shipmode#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), partial_sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(38) Exchange +(40) Exchange Input [3]: [l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) HashAggregate +(41) HashAggregate Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 
1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(40) Exchange +(42) Exchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Sort +(43) Sort Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(42) AdaptiveSparkPlan +(44) AdaptiveSparkPlan Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/13.txt index aa873582fab21..f64de4dee4b20 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/13.txt @@ -1,53 +1,52 @@ == Physical Plan == -AdaptiveSparkPlan (52) +AdaptiveSparkPlan (55) +- == Final Plan == - VeloxColumnarToRowExec (36) - +- ^ SortExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31), Statistics(X) - +- ColumnarExchange (30) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (39) + +- ^ SortExecTransformer (37) + +- ^ InputIteratorTransformer (36) + +- ShuffleQueryStage (34), Statistics(X) + +- ColumnarExchange (33) + +- VeloxAppendBatches (32) + +- ^ RegularHashAggregateExecTransformer (30) + +- ^ InputIteratorTransformer (29) + +- ShuffleQueryStage (27), Statistics(X) + +- ColumnarExchange (26) + +- VeloxAppendBatches (25) + +- ^ ProjectExecTransformer (23) + +- ^ FlushableHashAggregateExecTransformer (22) + +- ^ ProjectExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ ProjectExecTransformer (13) +- ^ FlushableHashAggregateExecTransformer (12) +- ^ ProjectExecTransformer (11) - +- ^ BroadcastHashJoinExecTransformer LeftOuter (10) + +- ^ BroadcastHashJoinExecTransformer LeftOuter BuildRight (10) :- ^ Scan parquet (1) +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ BroadcastQueryStage (7), Statistics(X) - +- ColumnarBroadcastExchange (6) - +- ^ ProjectExecTransformer (4) - +- ^ FilterExecTransformer (3) - +- ^ Scan parquet (2) + +- BroadcastQueryStage (7), Statistics(X) + +- ColumnarBroadcastExchange (6) + +- ^ ProjectExecTransformer (4) + +- ^ FilterExecTransformer (3) + +- ^ Scan parquet (2) +- == Initial Plan == - Sort (51) - +- Exchange (50) - +- HashAggregate (49) - +- Exchange (48) - +- HashAggregate (47) - +- HashAggregate (46) - +- Exchange (45) - +- HashAggregate (44) - +- Project (43) - +- BroadcastHashJoin LeftOuter BuildRight (42) - :- Scan parquet (37) - +- BroadcastExchange (41) - +- Project (40) - +- Filter (39) - +- 
Scan parquet (38) + Sort (54) + +- Exchange (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- HashAggregate (49) + +- Exchange (48) + +- HashAggregate (47) + +- Project (46) + +- BroadcastHashJoin LeftOuter BuildRight (45) + :- Scan parquet (40) + +- BroadcastExchange (44) + +- Project (43) + +- Filter (42) + +- Scan parquet (41) (1) Scan parquet @@ -114,175 +113,187 @@ Input [2]: [c_custkey#X, count#X] Input [3]: [hash_partition_key#X, c_custkey#X, count#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, count#X] +Arguments: X + +(16) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, count#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [2]: [c_custkey#X, count#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [2]: [c_custkey#X, count#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [2]: [c_custkey#X, count#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [2]: [c_custkey#X, count(o_orderkey#X)#X] -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [count(o_orderkey#X)#X AS c_count#X] Input [2]: [c_custkey#X, count(o_orderkey#X)#X] -(21) FlushableHashAggregateExecTransformer +(22) FlushableHashAggregateExecTransformer Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(22) ProjectExecTransformer +(23) ProjectExecTransformer Output [3]: [hash(c_count#X, 42) AS hash_partition_key#X, c_count#X, count#X] Input [2]: [c_count#X, count#X] -(23) WholeStageCodegenTransformer (X) +(24) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: false -(24) ColumnarExchange +(25) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_count#X, count#X] +Arguments: X + +(26) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(27) ShuffleQueryStage Output [2]: [c_count#X, count#X] Arguments: X -(26) InputAdapter +(28) InputAdapter Input [2]: [c_count#X, count#X] -(27) InputIteratorTransformer +(29) InputIteratorTransformer Input [2]: [c_count#X, count#X] -(28) RegularHashAggregateExecTransformer +(30) RegularHashAggregateExecTransformer Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(29) WholeStageCodegenTransformer (X) +(31) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(30) ColumnarExchange +(32) VeloxAppendBatches +Input [2]: [c_count#X, custdist#X] +Arguments: X + +(33) ColumnarExchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(34) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] Arguments: X -(32) InputAdapter +(35) InputAdapter Input [2]: [c_count#X, custdist#X] -(33) InputIteratorTransformer +(36) InputIteratorTransformer Input [2]: 
[c_count#X, custdist#X] -(34) SortExecTransformer +(37) SortExecTransformer Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(35) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(36) VeloxColumnarToRowExec +(39) VeloxColumnarToRowExec Input [2]: [c_count#X, custdist#X] -(37) Scan parquet +(40) Scan parquet Output [1]: [c_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Scan parquet +(41) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(39) Filter +(42) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Condition : ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(40) Project +(43) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(41) BroadcastExchange +(44) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(42) BroadcastHashJoin +(45) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: LeftOuter Join condition: None -(43) Project +(46) Project Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(44) HashAggregate +(47) HashAggregate Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(45) Exchange +(48) Exchange Input [2]: [c_custkey#X, count#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(46) HashAggregate +(49) HashAggregate Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [1]: [count(o_orderkey#X)#X AS c_count#X] -(47) HashAggregate +(50) HashAggregate Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(48) Exchange +(51) Exchange Input [2]: [c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) HashAggregate +(52) HashAggregate Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(50) Exchange +(53) Exchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Sort +(54) Sort Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(52) AdaptiveSparkPlan +(55) AdaptiveSparkPlan Output [2]: [c_count#X, custdist#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/14.txt index 3b5202f36c1ea..901c481fac69a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/14.txt @@ -1,37 +1,36 @@ == Physical Plan == -AdaptiveSparkPlan (34) +AdaptiveSparkPlan (35) 
+- == Final Plan == - VeloxColumnarToRowExec (22) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (23) + +- ^ ProjectExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) :- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ BroadcastQueryStage (8), Statistics(X) - +- ColumnarBroadcastExchange (7) - +- ^ FilterExecTransformer (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (8), Statistics(X) + +- ColumnarBroadcastExchange (7) + +- ^ FilterExecTransformer (5) + +- ^ Scan parquet (4) +- == Initial Plan == - HashAggregate (33) - +- Exchange (32) - +- HashAggregate (31) - +- Project (30) - +- BroadcastHashJoin Inner BuildRight (29) - :- Project (25) - : +- Filter (24) - : +- Scan parquet (23) - +- BroadcastExchange (28) - +- Filter (27) - +- Scan parquet (26) + HashAggregate (34) + +- Exchange (33) + +- HashAggregate (32) + +- Project (31) + +- BroadcastHashJoin Inner BuildRight (30) + :- Project (26) + : +- Filter (25) + : +- Scan parquet (24) + +- BroadcastExchange (29) + +- Filter (28) + +- Scan parquet (27) (1) Scan parquet @@ -99,96 +98,100 @@ Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(16) ColumnarExchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END), sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [((100.00 * sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X) / sum((l_extendedprice#X * (1 - l_discount#X)))#X) AS promo_revenue#X] Input [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) 
Input [1]: [promo_revenue#X] Arguments: false -(22) VeloxColumnarToRowExec +(23) VeloxColumnarToRowExec Input [1]: [promo_revenue#X] -(23) Scan parquet +(24) Scan parquet Output [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-09-01), LessThan(l_shipdate,1995-10-01), IsNotNull(l_partkey)] ReadSchema: struct -(24) Filter +(25) Filter Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-09-01)) AND (l_shipdate#X < 1995-10-01)) AND isnotnull(l_partkey#X)) -(25) Project +(26) Project Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(26) Scan parquet +(27) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(27) Filter +(28) Filter Input [2]: [p_partkey#X, p_type#X] Condition : isnotnull(p_partkey#X) -(28) BroadcastExchange +(29) BroadcastExchange Input [2]: [p_partkey#X, p_type#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(29) BroadcastHashJoin +(30) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(30) Project +(31) Project Output [3]: [l_extendedprice#X, l_discount#X, p_type#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(31) HashAggregate +(32) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, p_type#X] Keys: [] Functions [2]: [partial_sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END), partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(32) Exchange +(33) Exchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(33) HashAggregate +(34) HashAggregate Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END), sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [((100.00 * sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X) / sum((l_extendedprice#X * (1 - l_discount#X)))#X) AS promo_revenue#X] -(34) AdaptiveSparkPlan +(35) AdaptiveSparkPlan Output [1]: [promo_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/15.txt index 0fd34791c1507..a87fdff4537d9 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/15.txt @@ -1,44 +1,44 @@ == Physical Plan == -AdaptiveSparkPlan (41) +AdaptiveSparkPlan (43) +- == Final Plan == - VeloxColumnarToRowExec (26) - +- AQEShuffleRead (25) - +- ShuffleQueryStage (24), Statistics(X) - +- ColumnarExchange (23) - +- ^ ProjectExecTransformer 
(21) - +- ^ BroadcastHashJoinExecTransformer Inner (20) - :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ BroadcastQueryStage (5), Statistics(X) - : +- ColumnarBroadcastExchange (4) - : +- ^ FilterExecTransformer (2) - : +- ^ Scan parquet (1) - +- ^ FilterExecTransformer (19) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ InputIteratorTransformer (17) - +- ^ InputAdapter (16) - +- ^ ShuffleQueryStage (15), Statistics(X) - +- ColumnarExchange (14) - +- ^ ProjectExecTransformer (12) - +- ^ FlushableHashAggregateExecTransformer (11) - +- ^ ProjectExecTransformer (10) - +- ^ FilterExecTransformer (9) - +- ^ Scan parquet (8) + VeloxColumnarToRowExec (28) + +- AQEShuffleRead (27) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (21) + :- ^ InputIteratorTransformer (7) + : +- BroadcastQueryStage (5), Statistics(X) + : +- ColumnarBroadcastExchange (4) + : +- ^ FilterExecTransformer (2) + : +- ^ Scan parquet (1) + +- ^ FilterExecTransformer (20) + +- ^ RegularHashAggregateExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FlushableHashAggregateExecTransformer (11) + +- ^ ProjectExecTransformer (10) + +- ^ FilterExecTransformer (9) + +- ^ Scan parquet (8) +- == Initial Plan == - Sort (40) - +- Exchange (39) - +- Project (38) - +- BroadcastHashJoin Inner BuildLeft (37) - :- BroadcastExchange (29) - : +- Filter (28) - : +- Scan parquet (27) - +- Filter (36) - +- HashAggregate (35) - +- Exchange (34) - +- HashAggregate (33) - +- Project (32) - +- Filter (31) - +- Scan parquet (30) + Sort (42) + +- Exchange (41) + +- Project (40) + +- BroadcastHashJoin Inner BuildLeft (39) + :- BroadcastExchange (31) + : +- Filter (30) + : +- Scan parquet (29) + +- Filter (38) + +- HashAggregate (37) + +- Exchange (36) + +- HashAggregate (35) + +- Project (34) + +- Filter (33) + +- Scan parquet (32) (1) Scan parquet @@ -100,287 +100,299 @@ Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] -(19) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [supplier_no#X, total_revenue#X] Arguments: (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(20) BroadcastHashJoinExecTransformer +(21) 
BroadcastHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join type: Inner Join condition: None -(21) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: X -(25) AQEShuffleRead +(27) AQEShuffleRead Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: local -(26) VeloxColumnarToRowExec +(28) VeloxColumnarToRowExec Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(27) Scan parquet +(29) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey)] ReadSchema: struct -(28) Filter +(30) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Condition : isnotnull(s_suppkey#X) -(29) BroadcastExchange +(31) BroadcastExchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(30) Scan parquet +(32) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(31) Filter +(33) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(32) Project +(34) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(33) HashAggregate +(35) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(34) Exchange +(36) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(35) HashAggregate +(37) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] -(36) Filter +(38) Filter Input [2]: [supplier_no#X, total_revenue#X] Condition : (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(37) BroadcastHashJoin +(39) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: 
[supplier_no#X] Join type: Inner Join condition: None -(38) Project +(40) Project Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(39) Exchange +(41) Exchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(40) Sort +(42) Sort Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(41) AdaptiveSparkPlan +(43) AdaptiveSparkPlan Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 19 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (66) +Subquery:1 Hosting operator id = 20 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (69) +- == Final Plan == - VeloxColumnarToRowExec (57) - +- ^ RegularHashAggregateExecTransformer (55) - +- ^ RegularHashAggregateExecTransformer (54) - +- ^ ProjectExecTransformer (53) - +- ^ RegularHashAggregateExecTransformer (52) - +- ^ InputIteratorTransformer (51) - +- ^ InputAdapter (50) - +- ^ ShuffleQueryStage (49), Statistics(X) - +- ColumnarExchange (48) - +- ^ ProjectExecTransformer (46) - +- ^ FlushableHashAggregateExecTransformer (45) - +- ^ ProjectExecTransformer (44) - +- ^ FilterExecTransformer (43) - +- ^ Scan parquet (42) + VeloxColumnarToRowExec (60) + +- ^ RegularHashAggregateExecTransformer (58) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ ProjectExecTransformer (56) + +- ^ RegularHashAggregateExecTransformer (55) + +- ^ InputIteratorTransformer (54) + +- ShuffleQueryStage (52), Statistics(X) + +- ColumnarExchange (51) + +- VeloxAppendBatches (50) + +- ^ ProjectExecTransformer (48) + +- ^ FlushableHashAggregateExecTransformer (47) + +- ^ ProjectExecTransformer (46) + +- ^ FilterExecTransformer (45) + +- ^ Scan parquet (44) +- == Initial Plan == - HashAggregate (65) - +- HashAggregate (64) - +- HashAggregate (63) - +- Exchange (62) - +- HashAggregate (61) - +- Project (60) - +- Filter (59) - +- Scan parquet (58) + HashAggregate (68) + +- HashAggregate (67) + +- HashAggregate (66) + +- Exchange (65) + +- HashAggregate (64) + +- Project (63) + +- Filter (62) + +- Scan parquet (61) -(42) Scan parquet +(44) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(43) FilterExecTransformer +(45) FilterExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: ((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) -(44) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(45) FlushableHashAggregateExecTransformer +(47) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(46) ProjectExecTransformer +(48) 
ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(47) WholeStageCodegenTransformer (X) +(49) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(48) ColumnarExchange +(50) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(51) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(49) ShuffleQueryStage +(52) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(50) InputAdapter +(53) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(51) InputIteratorTransformer +(54) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(52) RegularHashAggregateExecTransformer +(55) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [l_suppkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(53) ProjectExecTransformer +(56) ProjectExecTransformer Output [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] Input [2]: [l_suppkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(54) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(55) RegularHashAggregateExecTransformer +(58) RegularHashAggregateExecTransformer Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(56) WholeStageCodegenTransformer (X) +(59) WholeStageCodegenTransformer (X) Input [1]: [max(total_revenue)#X] Arguments: false -(57) VeloxColumnarToRowExec +(60) VeloxColumnarToRowExec Input [1]: [max(total_revenue)#X] -(58) Scan parquet +(61) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(59) Filter +(62) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) -(60) Project +(63) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(61) HashAggregate +(64) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(62) Exchange +(65) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(63) HashAggregate +(66) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - 
l_discount#X)))#X] Results [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] -(64) HashAggregate +(67) HashAggregate Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(65) HashAggregate +(68) HashAggregate Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(66) AdaptiveSparkPlan +(69) AdaptiveSparkPlan Output [1]: [max(total_revenue)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/16.txt index ed932e494f66f..e2f073b8908f0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/16.txt @@ -1,57 +1,56 @@ == Physical Plan == -AdaptiveSparkPlan (56) +AdaptiveSparkPlan (59) +- == Final Plan == - VeloxColumnarToRowExec (35) - +- ^ SortExecTransformer (33) - +- ^ InputIteratorTransformer (32) - +- ^ InputAdapter (31) - +- ^ ShuffleQueryStage (30), Statistics(X) - +- ColumnarExchange (29) - +- ^ RegularHashAggregateExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24), Statistics(X) - +- ColumnarExchange (23) - +- ^ ProjectExecTransformer (21) - +- ^ FlushableHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ ProjectExecTransformer (13) +- ^ FlushableHashAggregateExecTransformer (12) +- ^ ProjectExecTransformer (11) - +- ^ BroadcastHashJoinExecTransformer Inner (10) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (10) :- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ BroadcastQueryStage (7), Statistics(X) - +- ColumnarBroadcastExchange (6) - +- ^ FilterExecTransformer (4) - +- ^ Scan parquet (3) + +- BroadcastQueryStage (7), Statistics(X) + +- ColumnarBroadcastExchange (6) + +- ^ FilterExecTransformer (4) + +- ^ Scan parquet (3) +- == Initial Plan == - Sort (55) - +- Exchange (54) - +- HashAggregate (53) - +- Exchange (52) - +- HashAggregate (51) - +- HashAggregate (50) - +- Exchange (49) - +- HashAggregate (48) - +- Project (47) - +- BroadcastHashJoin Inner BuildRight (46) - :- BroadcastHashJoin LeftAnti BuildRight (42) - : :- Filter (37) - : : +- Scan parquet (36) - : +- BroadcastExchange (41) - : +- Project (40) - : +- Filter (39) - : +- Scan parquet (38) - +- BroadcastExchange (45) - +- Filter (44) - +- Scan parquet (43) + Sort (58) + +- Exchange 
(57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- HashAggregate (53) + +- Exchange (52) + +- HashAggregate (51) + +- Project (50) + +- BroadcastHashJoin Inner BuildRight (49) + :- BroadcastHashJoin LeftAnti BuildRight (45) + : :- Filter (40) + : : +- Scan parquet (39) + : +- BroadcastExchange (44) + : +- Project (43) + : +- Filter (42) + : +- Scan parquet (41) + +- BroadcastExchange (48) + +- Filter (47) + +- Scan parquet (46) (1) Scan parquet @@ -119,197 +118,209 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] +Arguments: X + +(16) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(20) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(21) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: false -(23) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: X -(25) InputAdapter +(27) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(27) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(28) WholeStageCodegenTransformer (X) +(30) 
WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(29) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(30) ShuffleQueryStage +(33) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: X -(31) InputAdapter +(34) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(32) InputIteratorTransformer +(35) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(33) SortExecTransformer +(36) SortExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(34) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(35) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(36) Scan parquet +(39) Scan parquet Output [2]: [ps_partkey#X, ps_suppkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_partkey)] ReadSchema: struct -(37) Filter +(40) Filter Input [2]: [ps_partkey#X, ps_suppkey#X] Condition : isnotnull(ps_partkey#X) -(38) Scan parquet +(41) Scan parquet Output [2]: [s_suppkey#X, s_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_comment)] ReadSchema: struct -(39) Filter +(42) Filter Input [2]: [s_suppkey#X, s_comment#X] Condition : (isnotnull(s_comment#X) AND s_comment#X LIKE %Customer%Complaints%) -(40) Project +(43) Project Output [1]: [s_suppkey#X] Input [2]: [s_suppkey#X, s_comment#X] -(41) BroadcastExchange +(44) BroadcastExchange Input [1]: [s_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),true), [plan_id=X] -(42) BroadcastHashJoin +(45) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: LeftAnti Join condition: None -(43) Scan parquet +(46) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(44) Filter +(47) Filter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Condition : (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(45) BroadcastExchange +(48) BroadcastExchange Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(46) BroadcastHashJoin +(49) BroadcastHashJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(47) Project +(50) Project Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(48) HashAggregate +(51) 
HashAggregate Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(49) Exchange +(52) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(50) HashAggregate +(53) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(51) HashAggregate +(54) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(52) Exchange +(55) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(53) HashAggregate +(56) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(54) Exchange +(57) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(55) Sort +(58) Sort Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(56) AdaptiveSparkPlan +(59) AdaptiveSparkPlan Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/17.txt index 4a2b5a1744e07..77733ad6f8a5b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/17.txt @@ -1,39 +1,38 @@ == Physical Plan == -AdaptiveSparkPlan (35) +AdaptiveSparkPlan (36) +- == Final Plan == - VeloxColumnarToRowExec (14) - +- ^ ProjectExecTransformer (12) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8), Statistics(X) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (15) + +- ^ ProjectExecTransformer (13) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9), Statistics(X) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ FlushableHashAggregateExecTransformer (5) +- ^ InputIteratorTransformer (4) - +- ^ InputAdapter (3) - +- ^ RowToVeloxColumnar (2) - +- ^ LocalTableScan (1) + +- RowToVeloxColumnar (2) + +- LocalTableScan (1) +- == Initial Plan == - HashAggregate (34) - +- Exchange (33) - +- HashAggregate (32) - +- Project (31) - +- BroadcastHashJoin Inner BuildRight (30) - :- Project (22) - : +- BroadcastHashJoin Inner BuildRight (21) - : :- Filter (16) - : : +- Scan parquet (15) - : +- 
BroadcastExchange (20) - : +- Project (19) - : +- Filter (18) - : +- Scan parquet (17) - +- BroadcastExchange (29) - +- Filter (28) - +- HashAggregate (27) - +- Exchange (26) - +- HashAggregate (25) - +- Filter (24) - +- Scan parquet (23) + HashAggregate (35) + +- Exchange (34) + +- HashAggregate (33) + +- Project (32) + +- BroadcastHashJoin Inner BuildRight (31) + :- Project (23) + : +- BroadcastHashJoin Inner BuildRight (22) + : :- Filter (17) + : : +- Scan parquet (16) + : +- BroadcastExchange (21) + : +- Project (20) + : +- Filter (19) + : +- Scan parquet (18) + +- BroadcastExchange (30) + +- Filter (29) + +- HashAggregate (28) + +- Exchange (27) + +- HashAggregate (26) + +- Filter (25) + +- Scan parquet (24) (1) LocalTableScan @@ -60,143 +59,147 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(8) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [2]: [sum#X, isEmpty#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [sum(l_extendedprice#X)#X] -(12) ProjectExecTransformer +(13) ProjectExecTransformer Output [1]: [(sum(l_extendedprice#X)#X / 7.0) AS avg_yearly#X] Input [1]: [sum(l_extendedprice#X)#X] -(13) WholeStageCodegenTransformer (X) +(14) WholeStageCodegenTransformer (X) Input [1]: [avg_yearly#X] Arguments: false -(14) VeloxColumnarToRowExec +(15) VeloxColumnarToRowExec Input [1]: [avg_yearly#X] -(15) Scan parquet +(16) Scan parquet Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_quantity)] ReadSchema: struct -(16) Filter +(17) Filter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Condition : (isnotnull(l_partkey#X) AND isnotnull(l_quantity#X)) -(17) Scan parquet +(18) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(18) Filter +(19) Filter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Condition : ((((isnotnull(p_brand#X) AND isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(19) Project +(20) Project Output [1]: [p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(20) BroadcastExchange +(21) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(21) BroadcastHashJoin +(22) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(22) Project +(23) Project Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(23) Scan parquet +(24) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] 
ReadSchema: struct -(24) Filter +(25) Filter Input [2]: [l_partkey#X, l_quantity#X] Condition : isnotnull(l_partkey#X) -(25) HashAggregate +(26) HashAggregate Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(26) Exchange +(27) Exchange Input [3]: [l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) HashAggregate +(28) HashAggregate Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [(0.2 * avg(l_quantity#X)#X) AS (0.2 * avg(l_quantity))#X, l_partkey#X] -(28) Filter +(29) Filter Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Condition : isnotnull((0.2 * avg(l_quantity))#X) -(29) BroadcastExchange +(30) BroadcastExchange Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, true]),false), [plan_id=X] -(30) BroadcastHashJoin +(31) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(31) Project +(32) Project Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(32) HashAggregate +(33) HashAggregate Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(33) Exchange +(34) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(34) HashAggregate +(35) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [(sum(l_extendedprice#X)#X / 7.0) AS avg_yearly#X] -(35) AdaptiveSparkPlan +(36) AdaptiveSparkPlan Output [1]: [avg_yearly#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/18.txt index 0969c7267af50..4f36a185da726 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/18.txt @@ -1,86 +1,82 @@ == Physical Plan == -AdaptiveSparkPlan (86) +AdaptiveSparkPlan (88) +- == Final Plan == - VeloxColumnarToRowExec (53) - +- TakeOrderedAndProjectExecTransformer (52) - +- ^ RegularHashAggregateExecTransformer (50) - +- ^ InputIteratorTransformer (49) - +- ^ InputAdapter (48) - +- ^ ShuffleQueryStage (47), Statistics(X) - +- ColumnarExchange (46) - +- ^ ProjectExecTransformer (44) - +- ^ FlushableHashAggregateExecTransformer (43) - +- ^ ProjectExecTransformer (42) - +- ^ BroadcastHashJoinExecTransformer Inner (41) - :- ^ ProjectExecTransformer (28) - : +- ^ BroadcastHashJoinExecTransformer Inner (27) + VeloxColumnarToRowExec (55) + +- TakeOrderedAndProjectExecTransformer (54) + +- ^ RegularHashAggregateExecTransformer (52) + +- ^ InputIteratorTransformer (51) + +- ShuffleQueryStage (49), Statistics(X) + +- ColumnarExchange (48) + +- VeloxAppendBatches (47) + +- ^ ProjectExecTransformer (45) + +- ^ FlushableHashAggregateExecTransformer (44) + +- ^ ProjectExecTransformer (43) + +- ^ 
BroadcastHashJoinExecTransformer Inner BuildRight (42) + :- ^ ProjectExecTransformer (29) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (28) : :- ^ InputIteratorTransformer (7) - : : +- ^ InputAdapter (6) - : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : +- ColumnarBroadcastExchange (4) - : : +- ^ FilterExecTransformer (2) - : : +- ^ Scan parquet (1) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (26) + : : +- BroadcastQueryStage (5), Statistics(X) + : : +- ColumnarBroadcastExchange (4) + : : +- ^ FilterExecTransformer (2) + : : +- ^ Scan parquet (1) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (27) : :- ^ FilterExecTransformer (9) : : +- ^ Scan parquet (8) - : +- ^ InputIteratorTransformer (25) - : +- ^ InputAdapter (24) - : +- ^ BroadcastQueryStage (23), Statistics(X) - : +- ColumnarBroadcastExchange (22) - : +- ^ ProjectExecTransformer (20) - : +- ^ FilterExecTransformer (19) - : +- ^ RegularHashAggregateExecTransformer (18) - : +- ^ InputIteratorTransformer (17) - : +- ^ InputAdapter (16) - : +- ^ ShuffleQueryStage (15), Statistics(X) - : +- ColumnarExchange (14) - : +- ^ ProjectExecTransformer (12) - : +- ^ FlushableHashAggregateExecTransformer (11) - : +- ^ Scan parquet (10) - +- ^ InputIteratorTransformer (40) - +- ^ InputAdapter (39) - +- ^ BroadcastQueryStage (38), Statistics(X) - +- ColumnarBroadcastExchange (37) - +- ^ BroadcastHashJoinExecTransformer LeftSemi (35) - :- ^ FilterExecTransformer (30) - : +- ^ Scan parquet (29) - +- ^ InputIteratorTransformer (34) - +- ^ InputAdapter (33) - +- ^ BroadcastQueryStage (32), Statistics(X) - +- ReusedExchange (31) + : +- ^ InputIteratorTransformer (26) + : +- BroadcastQueryStage (24), Statistics(X) + : +- ColumnarBroadcastExchange (23) + : +- ^ ProjectExecTransformer (21) + : +- ^ FilterExecTransformer (20) + : +- ^ RegularHashAggregateExecTransformer (19) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FlushableHashAggregateExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (41) + +- BroadcastQueryStage (39), Statistics(X) + +- ColumnarBroadcastExchange (38) + +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (36) + :- ^ FilterExecTransformer (31) + : +- ^ Scan parquet (30) + +- ^ InputIteratorTransformer (35) + +- BroadcastQueryStage (33), Statistics(X) + +- ReusedExchange (32) +- == Initial Plan == - TakeOrderedAndProject (85) - +- HashAggregate (84) - +- Exchange (83) - +- HashAggregate (82) - +- Project (81) - +- BroadcastHashJoin Inner BuildRight (80) - :- Project (68) - : +- BroadcastHashJoin Inner BuildLeft (67) - : :- BroadcastExchange (56) - : : +- Filter (55) - : : +- Scan parquet (54) - : +- BroadcastHashJoin LeftSemi BuildRight (66) - : :- Filter (58) - : : +- Scan parquet (57) - : +- BroadcastExchange (65) - : +- Project (64) - : +- Filter (63) - : +- HashAggregate (62) - : +- Exchange (61) - : +- HashAggregate (60) - : +- Scan parquet (59) - +- BroadcastExchange (79) - +- BroadcastHashJoin LeftSemi BuildRight (78) - :- Filter (70) - : +- Scan parquet (69) - +- BroadcastExchange (77) - +- Project (76) - +- Filter (75) - +- HashAggregate (74) - +- Exchange (73) - +- HashAggregate (72) - +- Scan parquet (71) + TakeOrderedAndProject (87) + +- HashAggregate (86) + +- Exchange (85) + +- HashAggregate (84) + +- Project (83) + +- BroadcastHashJoin Inner BuildRight (82) + :- Project (70) + : +- BroadcastHashJoin 
Inner BuildLeft (69) + : :- BroadcastExchange (58) + : : +- Filter (57) + : : +- Scan parquet (56) + : +- BroadcastHashJoin LeftSemi BuildRight (68) + : :- Filter (60) + : : +- Scan parquet (59) + : +- BroadcastExchange (67) + : +- Project (66) + : +- Filter (65) + : +- HashAggregate (64) + : +- Exchange (63) + : +- HashAggregate (62) + : +- Scan parquet (61) + +- BroadcastExchange (81) + +- BroadcastHashJoin LeftSemi BuildRight (80) + :- Filter (72) + : +- Scan parquet (71) + +- BroadcastExchange (79) + +- Project (78) + +- Filter (77) + +- HashAggregate (76) + +- Exchange (75) + +- HashAggregate (74) + +- Scan parquet (73) (1) Scan parquet @@ -144,341 +140,349 @@ Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(19) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(20) ProjectExecTransformer +(21) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(21) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [1]: [l_orderkey#X] Arguments: false -(22) ColumnarBroadcastExchange +(23) ColumnarBroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(23) BroadcastQueryStage +(24) BroadcastQueryStage Output [1]: [l_orderkey#X] Arguments: X -(24) InputAdapter +(25) InputAdapter Input [1]: [l_orderkey#X] -(25) InputIteratorTransformer +(26) InputIteratorTransformer Input [1]: [l_orderkey#X] -(26) BroadcastHashJoinExecTransformer +(27) BroadcastHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(27) BroadcastHashJoinExecTransformer +(28) BroadcastHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(28) ProjectExecTransformer +(29) ProjectExecTransformer Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(29) Scan parquet +(30) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(30) FilterExecTransformer +(31) FilterExecTransformer Input [2]: [l_orderkey#X, l_quantity#X] Arguments: isnotnull(l_orderkey#X) -(31) ReusedExchange [Reuses operator id: 22] +(32) 
ReusedExchange [Reuses operator id: 23] Output [1]: [l_orderkey#X] -(32) BroadcastQueryStage +(33) BroadcastQueryStage Output [1]: [l_orderkey#X] Arguments: X -(33) InputAdapter +(34) InputAdapter Input [1]: [l_orderkey#X] -(34) InputIteratorTransformer +(35) InputIteratorTransformer Input [1]: [l_orderkey#X] -(35) BroadcastHashJoinExecTransformer +(36) BroadcastHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(36) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [2]: [l_orderkey#X, l_quantity#X] Arguments: false -(37) ColumnarBroadcastExchange +(38) ColumnarBroadcastExchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(38) BroadcastQueryStage +(39) BroadcastQueryStage Output [2]: [l_orderkey#X, l_quantity#X] Arguments: X -(39) InputAdapter +(40) InputAdapter Input [2]: [l_orderkey#X, l_quantity#X] -(40) InputIteratorTransformer +(41) InputIteratorTransformer Input [2]: [l_orderkey#X, l_quantity#X] -(41) BroadcastHashJoinExecTransformer +(42) BroadcastHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(42) ProjectExecTransformer +(43) ProjectExecTransformer Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(43) FlushableHashAggregateExecTransformer +(44) FlushableHashAggregateExecTransformer Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(44) ProjectExecTransformer +(45) ProjectExecTransformer Output [8]: [hash(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 42) AS hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(45) WholeStageCodegenTransformer (X) +(46) WholeStageCodegenTransformer (X) Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: false -(46) ColumnarExchange +(47) VeloxAppendBatches +Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] +Arguments: X + +(48) ColumnarExchange Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(47) ShuffleQueryStage +(49) ShuffleQueryStage Output [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: X -(48) InputAdapter +(50) InputAdapter Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(49) InputIteratorTransformer +(51) InputIteratorTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, 
o_totalprice#X, sum#X, isEmpty#X] -(50) RegularHashAggregateExecTransformer +(52) RegularHashAggregateExecTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(51) WholeStageCodegenTransformer (X) +(53) WholeStageCodegenTransformer (X) Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: false -(52) TakeOrderedAndProjectExecTransformer +(54) TakeOrderedAndProjectExecTransformer Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X], 0 -(53) VeloxColumnarToRowExec +(55) VeloxColumnarToRowExec Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(54) Scan parquet +(56) Scan parquet Output [2]: [c_custkey#X, c_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey)] ReadSchema: struct -(55) Filter +(57) Filter Input [2]: [c_custkey#X, c_name#X] Condition : isnotnull(c_custkey#X) -(56) BroadcastExchange +(58) BroadcastExchange Input [2]: [c_custkey#X, c_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(57) Scan parquet +(59) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(58) Filter +(60) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Condition : (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(59) Scan parquet +(61) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(60) HashAggregate +(62) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(61) Exchange +(63) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(62) HashAggregate +(64) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(63) Filter +(65) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(64) Project +(66) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(65) BroadcastExchange +(67) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(66) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(67) BroadcastHashJoin +(69) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join 
condition: None -(68) Project +(70) Project Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(69) Scan parquet +(71) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [2]: [l_orderkey#X, l_quantity#X] Condition : isnotnull(l_orderkey#X) -(71) Scan parquet +(73) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(72) HashAggregate +(74) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(73) Exchange +(75) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) HashAggregate +(76) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(75) Filter +(77) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(76) Project +(78) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(77) BroadcastExchange +(79) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(78) BroadcastHashJoin +(80) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(79) BroadcastExchange +(81) BroadcastExchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(80) BroadcastHashJoin +(82) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(81) Project +(83) Project Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(82) HashAggregate +(84) HashAggregate Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(83) Exchange +(85) Exchange Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(84) HashAggregate +(86) HashAggregate Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(85) TakeOrderedAndProject +(87) 
TakeOrderedAndProject Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(86) AdaptiveSparkPlan +(88) AdaptiveSparkPlan Output [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/19.txt index 87f00043a52a6..7854a767b26bf 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/19.txt @@ -1,36 +1,35 @@ == Physical Plan == -AdaptiveSparkPlan (33) +AdaptiveSparkPlan (34) +- == Final Plan == - VeloxColumnarToRowExec (21) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) + VeloxColumnarToRowExec (22) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ InputIteratorTransformer (19) + +- ShuffleQueryStage (17), Statistics(X) + +- ColumnarExchange (16) + +- VeloxAppendBatches (15) +- ^ FlushableHashAggregateExecTransformer (13) +- ^ ProjectExecTransformer (12) - +- ^ BroadcastHashJoinExecTransformer Inner (11) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (11) :- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ BroadcastQueryStage (8), Statistics(X) - +- ColumnarBroadcastExchange (7) - +- ^ FilterExecTransformer (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (8), Statistics(X) + +- ColumnarBroadcastExchange (7) + +- ^ FilterExecTransformer (5) + +- ^ Scan parquet (4) +- == Initial Plan == - HashAggregate (32) - +- Exchange (31) - +- HashAggregate (30) - +- Project (29) - +- BroadcastHashJoin Inner BuildRight (28) - :- Project (24) - : +- Filter (23) - : +- Scan parquet (22) - +- BroadcastExchange (27) - +- Filter (26) - +- Scan parquet (25) + HashAggregate (33) + +- Exchange (32) + +- HashAggregate (31) + +- Project (30) + +- BroadcastHashJoin Inner BuildRight (29) + :- Project (25) + : +- Filter (24) + : +- Scan parquet (23) + +- BroadcastExchange (28) + +- Filter (27) + +- Scan parquet (26) (1) Scan parquet @@ -98,92 +97,96 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(15) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(16) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(17) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(18) InputAdapter Input [2]: [sum#X, isEmpty#X] -(18) InputIteratorTransformer +(19) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(20) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input 
[1]: [revenue#X] Arguments: false -(21) VeloxColumnarToRowExec +(22) VeloxColumnarToRowExec Input [1]: [revenue#X] -(22) Scan parquet +(23) Scan parquet Output [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipinstruct), In(l_shipmode, [AIR,AIR REG]), EqualTo(l_shipinstruct,DELIVER IN PERSON), IsNotNull(l_partkey), Or(Or(And(GreaterThanOrEqual(l_quantity,1.00),LessThanOrEqual(l_quantity,11.00)),And(GreaterThanOrEqual(l_quantity,10.00),LessThanOrEqual(l_quantity,20.00))),And(GreaterThanOrEqual(l_quantity,20.00),LessThanOrEqual(l_quantity,30.00)))] ReadSchema: struct -(23) Filter +(24) Filter Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Condition : ((((isnotnull(l_shipinstruct#X) AND l_shipmode#X IN (AIR,AIR REG)) AND (l_shipinstruct#X = DELIVER IN PERSON)) AND isnotnull(l_partkey#X)) AND ((((l_quantity#X >= 1.00) AND (l_quantity#X <= 11.00)) OR ((l_quantity#X >= 10.00) AND (l_quantity#X <= 20.00))) OR ((l_quantity#X >= 20.00) AND (l_quantity#X <= 30.00)))) -(24) Project +(25) Project Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] -(25) Scan parquet +(26) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(26) Filter +(27) Filter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Condition : (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(27) BroadcastExchange +(28) BroadcastExchange Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(28) BroadcastHashJoin +(29) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(29) Project +(30) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(30) HashAggregate +(31) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] 
Keys: [] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(31) Exchange +(32) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(32) HashAggregate +(33) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(33) AdaptiveSparkPlan +(34) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/20.txt index 5db32ebb42279..981017da501e3 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/20.txt @@ -1,96 +1,92 @@ == Physical Plan == -AdaptiveSparkPlan (96) +AdaptiveSparkPlan (98) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- AQEShuffleRead (59) - +- ShuffleQueryStage (58), Statistics(X) - +- ColumnarExchange (57) - +- ^ ProjectExecTransformer (55) - +- ^ BroadcastHashJoinExecTransformer Inner (54) - :- ^ ProjectExecTransformer (45) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (44) - : :- ^ FilterExecTransformer (2) - : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (43) - : +- ^ InputAdapter (42) - : +- ^ BroadcastQueryStage (41), Statistics(X) - : +- ColumnarBroadcastExchange (40) - : +- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) - : :- ^ InputIteratorTransformer (18) - : : +- ^ InputAdapter (17) - : : +- ^ BroadcastQueryStage (16), Statistics(X) - : : +- ColumnarBroadcastExchange (15) - : : +- ^ BroadcastHashJoinExecTransformer LeftSemi (13) - : : :- ^ FilterExecTransformer (4) - : : : +- ^ Scan parquet (3) - : : +- ^ InputIteratorTransformer (12) - : : +- ^ InputAdapter (11) - : : +- ^ BroadcastQueryStage (10), Statistics(X) - : : +- ColumnarBroadcastExchange (9) - : : +- ^ ProjectExecTransformer (7) - : : +- ^ FilterExecTransformer (6) - : : +- ^ Scan parquet (5) - : +- ^ FilterExecTransformer (36) - : +- ^ ProjectExecTransformer (35) - : +- ^ RegularHashAggregateExecTransformer (34) - : +- ^ InputIteratorTransformer (33) - : +- ^ InputAdapter (32) - : +- ^ ShuffleQueryStage (31), Statistics(X) - : +- ColumnarExchange (30) - : +- ^ ProjectExecTransformer (28) - : +- ^ FlushableHashAggregateExecTransformer (27) - : +- ^ BroadcastHashJoinExecTransformer LeftSemi (26) - : :- ^ ProjectExecTransformer (21) - : : +- ^ FilterExecTransformer (20) - : : +- ^ Scan parquet (19) - : +- ^ InputIteratorTransformer (25) - : +- ^ InputAdapter (24) - : +- ^ BroadcastQueryStage (23), Statistics(X) - : +- ReusedExchange (22) - +- ^ InputIteratorTransformer (53) - +- ^ InputAdapter (52) - +- ^ BroadcastQueryStage (51), Statistics(X) - +- ColumnarBroadcastExchange (50) - +- ^ ProjectExecTransformer (48) - +- ^ FilterExecTransformer (47) - +- ^ Scan parquet (46) + VeloxColumnarToRowExec (62) + +- AQEShuffleRead (61) + +- ShuffleQueryStage (60), Statistics(X) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ ProjectExecTransformer (56) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (55) + :- ^ ProjectExecTransformer (46) + : +- ^ BroadcastHashJoinExecTransformer 
LeftSemi BuildRight (45) + : :- ^ FilterExecTransformer (2) + : : +- ^ Scan parquet (1) + : +- ^ InputIteratorTransformer (44) + : +- BroadcastQueryStage (42), Statistics(X) + : +- ColumnarBroadcastExchange (41) + : +- ^ ProjectExecTransformer (39) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (38) + : :- ^ InputIteratorTransformer (18) + : : +- BroadcastQueryStage (16), Statistics(X) + : : +- ColumnarBroadcastExchange (15) + : : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (13) + : : :- ^ FilterExecTransformer (4) + : : : +- ^ Scan parquet (3) + : : +- ^ InputIteratorTransformer (12) + : : +- BroadcastQueryStage (10), Statistics(X) + : : +- ColumnarBroadcastExchange (9) + : : +- ^ ProjectExecTransformer (7) + : : +- ^ FilterExecTransformer (6) + : : +- ^ Scan parquet (5) + : +- ^ FilterExecTransformer (37) + : +- ^ ProjectExecTransformer (36) + : +- ^ RegularHashAggregateExecTransformer (35) + : +- ^ InputIteratorTransformer (34) + : +- ShuffleQueryStage (32), Statistics(X) + : +- ColumnarExchange (31) + : +- VeloxAppendBatches (30) + : +- ^ ProjectExecTransformer (28) + : +- ^ FlushableHashAggregateExecTransformer (27) + : +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (26) + : :- ^ ProjectExecTransformer (21) + : : +- ^ FilterExecTransformer (20) + : : +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (25) + : +- BroadcastQueryStage (23), Statistics(X) + : +- ReusedExchange (22) + +- ^ InputIteratorTransformer (54) + +- BroadcastQueryStage (52), Statistics(X) + +- ColumnarBroadcastExchange (51) + +- ^ ProjectExecTransformer (49) + +- ^ FilterExecTransformer (48) + +- ^ Scan parquet (47) +- == Initial Plan == - Sort (95) - +- Exchange (94) - +- Project (93) - +- BroadcastHashJoin Inner BuildRight (92) - :- Project (87) - : +- BroadcastHashJoin LeftSemi BuildRight (86) - : :- Filter (62) - : : +- Scan parquet (61) - : +- BroadcastExchange (85) - : +- Project (84) - : +- BroadcastHashJoin Inner BuildLeft (83) - : :- BroadcastExchange (70) - : : +- BroadcastHashJoin LeftSemi BuildRight (69) - : : :- Filter (64) - : : : +- Scan parquet (63) - : : +- BroadcastExchange (68) - : : +- Project (67) - : : +- Filter (66) - : : +- Scan parquet (65) - : +- Filter (82) - : +- HashAggregate (81) - : +- Exchange (80) - : +- HashAggregate (79) - : +- BroadcastHashJoin LeftSemi BuildRight (78) - : :- Project (73) - : : +- Filter (72) - : : +- Scan parquet (71) - : +- BroadcastExchange (77) - : +- Project (76) - : +- Filter (75) - : +- Scan parquet (74) - +- BroadcastExchange (91) - +- Project (90) - +- Filter (89) - +- Scan parquet (88) + Sort (97) + +- Exchange (96) + +- Project (95) + +- BroadcastHashJoin Inner BuildRight (94) + :- Project (89) + : +- BroadcastHashJoin LeftSemi BuildRight (88) + : :- Filter (64) + : : +- Scan parquet (63) + : +- BroadcastExchange (87) + : +- Project (86) + : +- BroadcastHashJoin Inner BuildLeft (85) + : :- BroadcastExchange (72) + : : +- BroadcastHashJoin LeftSemi BuildRight (71) + : : :- Filter (66) + : : : +- Scan parquet (65) + : : +- BroadcastExchange (70) + : : +- Project (69) + : : +- Filter (68) + : : +- Scan parquet (67) + : +- Filter (84) + : +- HashAggregate (83) + : +- Exchange (82) + : +- HashAggregate (81) + : +- BroadcastHashJoin LeftSemi BuildRight (80) + : :- Project (75) + : : +- Filter (74) + : : +- Scan parquet (73) + : +- BroadcastExchange (79) + : +- Project (78) + : +- Filter (77) + : +- Scan parquet (76) + +- BroadcastExchange (93) + +- Project (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet 
@@ -221,309 +217,317 @@ Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(30) ColumnarExchange +(30) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(31) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(32) ShuffleQueryStage Output [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(32) InputAdapter +(33) InputAdapter Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(33) InputIteratorTransformer +(34) InputIteratorTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(34) RegularHashAggregateExecTransformer +(35) RegularHashAggregateExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(35) ProjectExecTransformer +(36) ProjectExecTransformer Output [3]: [(0.5 * sum(l_quantity#X)#X) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(36) FilterExecTransformer +(37) FilterExecTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: isnotnull((0.5 * sum(l_quantity))#X) -(37) BroadcastHashJoinExecTransformer +(38) BroadcastHashJoinExecTransformer Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join type: Inner Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(38) ProjectExecTransformer +(39) ProjectExecTransformer Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(39) WholeStageCodegenTransformer (X) +(40) WholeStageCodegenTransformer (X) Input [1]: [ps_suppkey#X] Arguments: false -(40) ColumnarBroadcastExchange +(41) ColumnarBroadcastExchange Input [1]: [ps_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(41) BroadcastQueryStage +(42) BroadcastQueryStage Output [1]: [ps_suppkey#X] Arguments: X -(42) InputAdapter +(43) InputAdapter Input [1]: [ps_suppkey#X] -(43) InputIteratorTransformer +(44) InputIteratorTransformer Input [1]: [ps_suppkey#X] -(44) BroadcastHashJoinExecTransformer +(45) BroadcastHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join type: LeftSemi Join condition: None -(45) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(46) Scan parquet +(47) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(47) FilterExecTransformer +(48) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(48) ProjectExecTransformer +(49) ProjectExecTransformer Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(49) WholeStageCodegenTransformer (X) +(50) WholeStageCodegenTransformer 
(X) Input [1]: [n_nationkey#X] Arguments: false -(50) ColumnarBroadcastExchange +(51) ColumnarBroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(51) BroadcastQueryStage +(52) BroadcastQueryStage Output [1]: [n_nationkey#X] Arguments: X -(52) InputAdapter +(53) InputAdapter Input [1]: [n_nationkey#X] -(53) InputIteratorTransformer +(54) InputIteratorTransformer Input [1]: [n_nationkey#X] -(54) BroadcastHashJoinExecTransformer +(55) BroadcastHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(55) ProjectExecTransformer +(56) ProjectExecTransformer Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(56) WholeStageCodegenTransformer (X) +(57) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(57) ColumnarExchange +(58) VeloxAppendBatches +Input [2]: [s_name#X, s_address#X] +Arguments: X + +(59) ColumnarExchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(58) ShuffleQueryStage +(60) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] Arguments: X -(59) AQEShuffleRead +(61) AQEShuffleRead Input [2]: [s_name#X, s_address#X] Arguments: local -(60) VeloxColumnarToRowExec +(62) VeloxColumnarToRowExec Input [2]: [s_name#X, s_address#X] -(61) Scan parquet +(63) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_nationkey)] ReadSchema: struct -(62) Filter +(64) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Condition : isnotnull(s_nationkey#X) -(63) Scan parquet +(65) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(64) Filter +(66) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Condition : ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(65) Scan parquet +(67) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(66) Filter +(68) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(67) Project +(69) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(68) BroadcastExchange +(70) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(69) BroadcastHashJoin +(71) BroadcastHashJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join type: LeftSemi Join condition: None -(70) BroadcastExchange +(72) BroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(71) Scan parquet +(73) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(72) Filter +(74) Filter Input 
[4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(73) Project +(75) Project Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(74) Scan parquet +(76) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(75) Filter +(77) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(76) Project +(78) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(77) BroadcastExchange +(79) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(78) BroadcastHashJoin +(80) BroadcastHashJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: LeftSemi Join condition: None -(79) HashAggregate +(81) HashAggregate Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(80) Exchange +(82) Exchange Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(81) HashAggregate +(83) HashAggregate Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [(0.5 * sum(l_quantity#X)#X) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(82) Filter +(84) Filter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Condition : isnotnull((0.5 * sum(l_quantity))#X) -(83) BroadcastHashJoin +(85) BroadcastHashJoin Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join type: Inner Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(84) Project +(86) Project Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(85) BroadcastExchange +(87) BroadcastExchange Input [1]: [ps_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join type: LeftSemi Join condition: None -(87) Project +(89) Project Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(88) Scan parquet +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(89) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(90) Project +(92) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(91) BroadcastExchange +(93) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(92) BroadcastHashJoin +(94) BroadcastHashJoin Left keys [1]: 
[s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(93) Project +(95) Project Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(94) Exchange +(96) Exchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(95) Sort +(97) Sort Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(96) AdaptiveSparkPlan +(98) AdaptiveSparkPlan Output [2]: [s_name#X, s_address#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/21.txt index 6e415d4a7c387..5aef62e337655 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/21.txt @@ -1,90 +1,85 @@ == Physical Plan == -AdaptiveSparkPlan (91) +AdaptiveSparkPlan (92) +- == Final Plan == - VeloxColumnarToRowExec (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) + VeloxColumnarToRowExec (59) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ InputIteratorTransformer (56) + +- ShuffleQueryStage (54), Statistics(X) + +- ColumnarExchange (53) + +- VeloxAppendBatches (52) +- ^ ProjectExecTransformer (50) +- ^ FlushableHashAggregateExecTransformer (49) +- ^ ProjectExecTransformer (48) - +- ^ BroadcastHashJoinExecTransformer Inner (47) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) :- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (37) : :- ^ ProjectExecTransformer (28) - : : +- ^ BroadcastHashJoinExecTransformer Inner (27) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (27) : : :- ^ InputIteratorTransformer (7) - : : : +- ^ InputAdapter (6) - : : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : : +- ColumnarBroadcastExchange (4) - : : : +- ^ FilterExecTransformer (2) - : : : +- ^ Scan parquet (1) - : : +- ^ BroadcastHashJoinExecTransformer LeftAnti (26) - : : :- ^ BroadcastHashJoinExecTransformer LeftSemi (17) + : : : +- BroadcastQueryStage (5), Statistics(X) + : : : +- ColumnarBroadcastExchange (4) + : : : +- ^ FilterExecTransformer (2) + : : : +- ^ Scan parquet (1) + : : +- ^ BroadcastHashJoinExecTransformer LeftAnti BuildRight (26) + : : :- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (17) : : : :- ^ ProjectExecTransformer (10) : : : : +- ^ FilterExecTransformer (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (16) - : : : +- ^ InputAdapter (15) - : : : +- ^ BroadcastQueryStage (14), Statistics(X) - : : : +- ColumnarBroadcastExchange (13) - : : : +- ^ Scan parquet (11) + : : : +- BroadcastQueryStage (14), Statistics(X) + : : : +- ColumnarBroadcastExchange (13) + : : : +- ^ Scan parquet (11) : : +- ^ InputIteratorTransformer (25) - : : +- ^ InputAdapter (24) - : : +- ^ BroadcastQueryStage (23), Statistics(X) - : : +- ColumnarBroadcastExchange (22) - : : +- ^ ProjectExecTransformer (20) - : : +- ^ FilterExecTransformer (19) - : : +- ^ Scan parquet (18) + : : +- BroadcastQueryStage (23), Statistics(X) + : : +- ColumnarBroadcastExchange (22) + : : +- ^ ProjectExecTransformer (20) + : : +- ^ FilterExecTransformer (19) 
+ : : +- ^ Scan parquet (18) : +- ^ InputIteratorTransformer (36) - : +- ^ InputAdapter (35) - : +- ^ BroadcastQueryStage (34), Statistics(X) - : +- ColumnarBroadcastExchange (33) - : +- ^ ProjectExecTransformer (31) - : +- ^ FilterExecTransformer (30) - : +- ^ Scan parquet (29) + : +- BroadcastQueryStage (34), Statistics(X) + : +- ColumnarBroadcastExchange (33) + : +- ^ ProjectExecTransformer (31) + : +- ^ FilterExecTransformer (30) + : +- ^ Scan parquet (29) +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ BroadcastQueryStage (44), Statistics(X) - +- ColumnarBroadcastExchange (43) - +- ^ ProjectExecTransformer (41) - +- ^ FilterExecTransformer (40) - +- ^ Scan parquet (39) + +- BroadcastQueryStage (44), Statistics(X) + +- ColumnarBroadcastExchange (43) + +- ^ ProjectExecTransformer (41) + +- ^ FilterExecTransformer (40) + +- ^ Scan parquet (39) +- == Initial Plan == - TakeOrderedAndProject (90) - +- HashAggregate (89) - +- Exchange (88) - +- HashAggregate (87) - +- Project (86) - +- BroadcastHashJoin Inner BuildRight (85) - :- Project (80) - : +- BroadcastHashJoin Inner BuildRight (79) - : :- Project (74) - : : +- BroadcastHashJoin Inner BuildLeft (73) - : : :- BroadcastExchange (61) - : : : +- Filter (60) - : : : +- Scan parquet (59) - : : +- BroadcastHashJoin LeftAnti BuildRight (72) - : : :- BroadcastHashJoin LeftSemi BuildRight (67) - : : : :- Project (64) - : : : : +- Filter (63) - : : : : +- Scan parquet (62) - : : : +- BroadcastExchange (66) - : : : +- Scan parquet (65) - : : +- BroadcastExchange (71) - : : +- Project (70) - : : +- Filter (69) - : : +- Scan parquet (68) - : +- BroadcastExchange (78) - : +- Project (77) - : +- Filter (76) - : +- Scan parquet (75) - +- BroadcastExchange (84) - +- Project (83) - +- Filter (82) - +- Scan parquet (81) + TakeOrderedAndProject (91) + +- HashAggregate (90) + +- Exchange (89) + +- HashAggregate (88) + +- Project (87) + +- BroadcastHashJoin Inner BuildRight (86) + :- Project (81) + : +- BroadcastHashJoin Inner BuildRight (80) + : :- Project (75) + : : +- BroadcastHashJoin Inner BuildLeft (74) + : : :- BroadcastExchange (62) + : : : +- Filter (61) + : : : +- Scan parquet (60) + : : +- BroadcastHashJoin LeftAnti BuildRight (73) + : : :- BroadcastHashJoin LeftSemi BuildRight (68) + : : : :- Project (65) + : : : : +- Filter (64) + : : : : +- Scan parquet (63) + : : : +- BroadcastExchange (67) + : : : +- Scan parquet (66) + : : +- BroadcastExchange (72) + : : +- Project (71) + : : +- Filter (70) + : : +- Scan parquet (69) + : +- BroadcastExchange (79) + : +- Project (78) + : +- Filter (77) + : +- Scan parquet (76) + +- BroadcastExchange (85) + +- Project (84) + +- Filter (83) + +- Scan parquet (82) (1) Scan parquet @@ -311,195 +306,199 @@ Input [2]: [s_name#X, count#X] Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: false -(52) ColumnarExchange +(52) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, count#X] +Arguments: X + +(53) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(54) ShuffleQueryStage Output [2]: [s_name#X, count#X] Arguments: X -(54) InputAdapter +(55) InputAdapter Input [2]: [s_name#X, count#X] -(55) InputIteratorTransformer +(56) InputIteratorTransformer Input [2]: [s_name#X, count#X] -(56) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions 
[1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(57) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, numwait#X] Arguments: false -(58) VeloxColumnarToRowExec +(59) VeloxColumnarToRowExec Input [2]: [s_name#X, numwait#X] -(59) Scan parquet +(60) Scan parquet Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(60) Filter +(61) Filter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(61) BroadcastExchange +(62) BroadcastExchange Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(62) Scan parquet +(63) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(63) Filter +(64) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(64) Project +(65) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(65) Scan parquet +(66) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(66) BroadcastExchange +(67) BroadcastExchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(67) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: NOT (l_suppkey#X = l_suppkey#X) -(68) Scan parquet +(69) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(69) Filter +(70) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(70) Project +(71) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(71) BroadcastExchange +(72) BroadcastExchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(72) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftAnti Join condition: NOT (l_suppkey#X = l_suppkey#X) -(73) BroadcastHashJoin +(74) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join type: Inner Join condition: None -(74) Project +(75) Project Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(75) Scan parquet +(76) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(76) 
Filter +(77) Filter Input [2]: [o_orderkey#X, o_orderstatus#X] Condition : ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(77) Project +(78) Project Output [1]: [o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(78) BroadcastExchange +(79) BroadcastExchange Input [1]: [o_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(79) BroadcastHashJoin +(80) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(80) Project +(81) Project Output [2]: [s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(81) Scan parquet +(82) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(82) Filter +(83) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(83) Project +(84) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(84) BroadcastExchange +(85) BroadcastExchange Input [1]: [n_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(85) BroadcastHashJoin +(86) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(86) Project +(87) Project Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(87) HashAggregate +(88) HashAggregate Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(88) Exchange +(89) Exchange Input [2]: [s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(89) HashAggregate +(90) HashAggregate Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(90) TakeOrderedAndProject +(91) TakeOrderedAndProject Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X] -(91) AdaptiveSparkPlan +(92) AdaptiveSparkPlan Output [2]: [s_name#X, numwait#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/22.txt index b0b5964d0a94c..fbda7224d6424 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/22.txt @@ -1,40 +1,39 @@ == Physical Plan == -AdaptiveSparkPlan (38) +AdaptiveSparkPlan (40) +- == Final Plan == - VeloxColumnarToRowExec (26) - +- ^ SortExecTransformer (24) - +- ^ InputIteratorTransformer (23) - +- ^ InputAdapter (22) - +- ^ ShuffleQueryStage (21), Statistics(X) - +- ColumnarExchange (20) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ InputIteratorTransformer (17) - +- ^ InputAdapter (16) - +- ^ ShuffleQueryStage (15), Statistics(X) - +- ColumnarExchange (14) + VeloxColumnarToRowExec (28) + +- ^ SortExecTransformer (26) + +- ^ InputIteratorTransformer (25) + +- ShuffleQueryStage (23), Statistics(X) + +- ColumnarExchange (22) + +- VeloxAppendBatches (21) + +- ^ RegularHashAggregateExecTransformer (19) + +- ^ 
InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) +- ^ ProjectExecTransformer (12) +- ^ FlushableHashAggregateExecTransformer (11) +- ^ ProjectExecTransformer (10) - +- ^ BroadcastHashJoinExecTransformer LeftAnti (9) + +- ^ BroadcastHashJoinExecTransformer LeftAnti BuildRight (9) :- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (8) - +- ^ InputAdapter (7) - +- ^ BroadcastQueryStage (6), Statistics(X) - +- ColumnarBroadcastExchange (5) - +- ^ Scan parquet (3) + +- BroadcastQueryStage (6), Statistics(X) + +- ColumnarBroadcastExchange (5) + +- ^ Scan parquet (3) +- == Initial Plan == - Sort (37) - +- Exchange (36) - +- HashAggregate (35) - +- Exchange (34) - +- HashAggregate (33) - +- Project (32) - +- BroadcastHashJoin LeftAnti BuildRight (31) - :- Filter (28) - : +- Scan parquet (27) - +- BroadcastExchange (30) - +- Scan parquet (29) + Sort (39) + +- Exchange (38) + +- HashAggregate (37) + +- Exchange (36) + +- HashAggregate (35) + +- Project (34) + +- BroadcastHashJoin LeftAnti BuildRight (33) + :- Filter (30) + : +- Scan parquet (29) + +- BroadcastExchange (32) + +- Scan parquet (31) (1) Scan parquet @@ -97,249 +96,261 @@ Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: false -(14) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(15) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: X -(16) InputAdapter +(17) InputAdapter Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(17) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(18) RegularHashAggregateExecTransformer +(19) RegularHashAggregateExecTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(19) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(20) ColumnarExchange +(21) VeloxAppendBatches +Input [3]: [cntrycode#X, numcust#X, totacctbal#X] +Arguments: X + +(22) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(23) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: X -(22) InputAdapter +(24) InputAdapter Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(23) InputIteratorTransformer +(25) InputIteratorTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(24) SortExecTransformer +(26) SortExecTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(25) WholeStageCodegenTransformer (X) +(27) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(26) VeloxColumnarToRowExec +(28) VeloxColumnarToRowExec Input [3]: 
[cntrycode#X, numcust#X, totacctbal#X] -(27) Scan parquet +(29) Scan parquet Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal)] ReadSchema: struct -(28) Filter +(30) Filter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) AND (cast(c_acctbal#X as decimal(16,6)) > Subquery subquery#X, [id=#X])) -(29) Scan parquet +(31) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(30) BroadcastExchange +(32) BroadcastExchange Input [1]: [o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(31) BroadcastHashJoin +(33) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: LeftAnti Join condition: None -(32) Project +(34) Project Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(33) HashAggregate +(35) HashAggregate Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(34) Exchange +(36) Exchange Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(35) HashAggregate +(37) HashAggregate Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(36) Exchange +(38) Exchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(37) Sort +(39) Sort Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(38) AdaptiveSparkPlan +(40) AdaptiveSparkPlan Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: isFinalPlan=true ===== Subqueries ===== Subquery:1 Hosting operator id = 2 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (57) +AdaptiveSparkPlan (60) +- == Final Plan == - VeloxColumnarToRowExec (50) - +- ^ RegularHashAggregateExecTransformer (48) - +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ ShuffleQueryStage (45), Statistics(X) - +- ColumnarExchange (44) - +- ^ FlushableHashAggregateExecTransformer (42) - +- ^ ProjectExecTransformer (41) - +- ^ FilterExecTransformer (40) - +- ^ Scan parquet (39) + VeloxColumnarToRowExec (53) + +- ^ RegularHashAggregateExecTransformer (51) + +- ^ InputIteratorTransformer (50) + +- ShuffleQueryStage (48), Statistics(X) + +- ColumnarExchange (47) + +- VeloxAppendBatches (46) + +- ^ FlushableHashAggregateExecTransformer (44) + +- ^ ProjectExecTransformer (43) + +- ^ FilterExecTransformer (42) + +- ^ Scan parquet (41) +- == Initial Plan == - HashAggregate (56) - +- Exchange (55) - +- HashAggregate (54) - +- Project (53) - +- Filter (52) - +- Scan parquet (51) + HashAggregate (59) + +- Exchange (58) + +- HashAggregate (57) + +- Project (56) + +- Filter (55) + +- Scan parquet (54) -(39) Scan parquet +(41) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), 
GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(40) FilterExecTransformer +(42) FilterExecTransformer Input [2]: [c_phone#X, c_acctbal#X] Arguments: ((isnotnull(c_acctbal#X) AND (c_acctbal#X > 0.00)) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) -(41) ProjectExecTransformer +(43) ProjectExecTransformer Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(42) FlushableHashAggregateExecTransformer +(44) FlushableHashAggregateExecTransformer Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(43) WholeStageCodegenTransformer (X) +(45) WholeStageCodegenTransformer (X) Input [2]: [sum#X, count#X] Arguments: false -(44) ColumnarExchange +(46) VeloxAppendBatches +Input [2]: [sum#X, count#X] +Arguments: X + +(47) ColumnarExchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(45) ShuffleQueryStage +(48) ShuffleQueryStage Output [2]: [sum#X, count#X] Arguments: X -(46) InputAdapter +(49) InputAdapter Input [2]: [sum#X, count#X] -(47) InputIteratorTransformer +(50) InputIteratorTransformer Input [2]: [sum#X, count#X] -(48) RegularHashAggregateExecTransformer +(51) RegularHashAggregateExecTransformer Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(49) WholeStageCodegenTransformer (X) +(52) WholeStageCodegenTransformer (X) Input [1]: [avg(c_acctbal)#X] Arguments: false -(50) VeloxColumnarToRowExec +(53) VeloxColumnarToRowExec Input [1]: [avg(c_acctbal)#X] -(51) Scan parquet +(54) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(52) Filter +(55) Filter Input [2]: [c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND (c_acctbal#X > 0.00)) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) -(53) Project +(56) Project Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(54) HashAggregate +(57) HashAggregate Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(55) Exchange +(58) Exchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(56) HashAggregate +(59) HashAggregate Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(57) AdaptiveSparkPlan +(60) AdaptiveSparkPlan Output [1]: [avg(c_acctbal)#X] Arguments: isFinalPlan=true Subquery:2 Hosting operator id = 1 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (57) +AdaptiveSparkPlan (60) +- == Final Plan == - VeloxColumnarToRowExec (50) - +- ^ RegularHashAggregateExecTransformer (48) - +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ ShuffleQueryStage (45), Statistics(X) - +- ColumnarExchange (44) - +- ^ FlushableHashAggregateExecTransformer (42) - +- ^ ProjectExecTransformer (41) - +- ^ FilterExecTransformer (40) - +- ^ Scan parquet (39) + VeloxColumnarToRowExec (53) + +- ^ RegularHashAggregateExecTransformer (51) + +- ^ InputIteratorTransformer (50) + +- ShuffleQueryStage (48), Statistics(X) + +- ColumnarExchange (47) + +- VeloxAppendBatches (46) + +- ^ FlushableHashAggregateExecTransformer (44) + +- 
^ ProjectExecTransformer (43) + +- ^ FilterExecTransformer (42) + +- ^ Scan parquet (41) +- == Initial Plan == - HashAggregate (56) - +- Exchange (55) - +- HashAggregate (54) - +- Project (53) - +- Filter (52) - +- Scan parquet (51) \ No newline at end of file + HashAggregate (59) + +- Exchange (58) + +- HashAggregate (57) + +- Project (56) + +- Filter (55) + +- Scan parquet (54) \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/3.txt index 6ce73eb0fa383..6d518ac272144 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/3.txt @@ -1,55 +1,53 @@ == Physical Plan == -AdaptiveSparkPlan (53) +AdaptiveSparkPlan (54) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- TakeOrderedAndProjectExecTransformer (33) - +- ^ ProjectExecTransformer (31) - +- ^ RegularHashAggregateExecTransformer (30) - +- ^ InputIteratorTransformer (29) - +- ^ InputAdapter (28) - +- ^ ShuffleQueryStage (27), Statistics(X) - +- ColumnarExchange (26) + VeloxColumnarToRowExec (35) + +- TakeOrderedAndProjectExecTransformer (34) + +- ^ ProjectExecTransformer (32) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) +- ^ ProjectExecTransformer (24) +- ^ FlushableHashAggregateExecTransformer (23) +- ^ ProjectExecTransformer (22) - +- ^ BroadcastHashJoinExecTransformer Inner (21) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (21) :- ^ ProjectExecTransformer (12) - : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ BroadcastQueryStage (6), Statistics(X) - : : +- ColumnarBroadcastExchange (5) - : : +- ^ ProjectExecTransformer (3) - : : +- ^ FilterExecTransformer (2) - : : +- ^ Scan parquet (1) + : : +- BroadcastQueryStage (6), Statistics(X) + : : +- ColumnarBroadcastExchange (5) + : : +- ^ ProjectExecTransformer (3) + : : +- ^ FilterExecTransformer (2) + : : +- ^ Scan parquet (1) : +- ^ FilterExecTransformer (10) : +- ^ Scan parquet (9) +- ^ InputIteratorTransformer (20) - +- ^ InputAdapter (19) - +- ^ BroadcastQueryStage (18), Statistics(X) - +- ColumnarBroadcastExchange (17) - +- ^ ProjectExecTransformer (15) - +- ^ FilterExecTransformer (14) - +- ^ Scan parquet (13) + +- BroadcastQueryStage (18), Statistics(X) + +- ColumnarBroadcastExchange (17) + +- ^ ProjectExecTransformer (15) + +- ^ FilterExecTransformer (14) + +- ^ Scan parquet (13) +- == Initial Plan == - TakeOrderedAndProject (52) - +- HashAggregate (51) - +- Exchange (50) - +- HashAggregate (49) - +- Project (48) - +- BroadcastHashJoin Inner BuildRight (47) - :- Project (42) - : +- BroadcastHashJoin Inner BuildLeft (41) - : :- BroadcastExchange (38) - : : +- Project (37) - : : +- Filter (36) - : : +- Scan parquet (35) - : +- Filter (40) - : +- Scan parquet (39) - +- BroadcastExchange (46) - +- Project (45) - +- Filter (44) - +- Scan parquet (43) + TakeOrderedAndProject (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- Project (49) + +- BroadcastHashJoin Inner BuildRight (48) + :- Project (43) + : +- BroadcastHashJoin Inner BuildLeft (42) + : :- BroadcastExchange (39) + : : +- Project (38) + : : +- Filter (37) + : : +- Scan 
parquet (36) + : +- Filter (41) + : +- Scan parquet (40) + +- BroadcastExchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -164,133 +162,137 @@ Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: false -(26) ColumnarExchange +(26) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] +Arguments: X + +(27) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(27) ShuffleQueryStage +(28) ShuffleQueryStage Output [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: X -(28) InputAdapter +(29) InputAdapter Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(29) InputIteratorTransformer +(30) InputIteratorTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(30) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(31) ProjectExecTransformer +(32) ProjectExecTransformer Output [4]: [l_orderkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] Input [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(32) WholeStageCodegenTransformer (X) +(33) WholeStageCodegenTransformer (X) Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: false -(33) TakeOrderedAndProjectExecTransformer +(34) TakeOrderedAndProjectExecTransformer Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X], 0 -(34) VeloxColumnarToRowExec +(35) VeloxColumnarToRowExec Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(35) Scan parquet +(36) Scan parquet Output [2]: [c_custkey#X, c_mktsegment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_mktsegment), EqualTo(c_mktsegment,BUILDING), IsNotNull(c_custkey)] ReadSchema: struct -(36) Filter +(37) Filter Input [2]: [c_custkey#X, c_mktsegment#X] Condition : ((isnotnull(c_mktsegment#X) AND (c_mktsegment#X = BUILDING)) AND isnotnull(c_custkey#X)) -(37) Project +(38) Project Output [1]: [c_custkey#X] Input [2]: [c_custkey#X, c_mktsegment#X] -(38) BroadcastExchange +(39) BroadcastExchange Input [1]: [c_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(39) Scan parquet +(40) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(40) 
Filter +(41) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Condition : (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(41) BroadcastHashJoin +(42) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(42) Project +(43) Project Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(43) Scan parquet +(44) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(44) Filter +(45) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(45) Project +(46) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(46) BroadcastExchange +(47) BroadcastExchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(47) BroadcastHashJoin +(48) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(48) Project +(49) Project Output [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(49) HashAggregate +(50) HashAggregate Input [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(50) Exchange +(51) Exchange Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) HashAggregate +(52) HashAggregate Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [4]: [l_orderkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] -(52) TakeOrderedAndProject +(53) TakeOrderedAndProject Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(53) AdaptiveSparkPlan +(54) AdaptiveSparkPlan Output [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/4.txt index 2401dc071221f..bb6c149c39e1d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/4.txt +++ 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/4.txt @@ -1,46 +1,45 @@ == Physical Plan == -AdaptiveSparkPlan (44) +AdaptiveSparkPlan (46) +- == Final Plan == - VeloxColumnarToRowExec (29) - +- ^ SortExecTransformer (27) - +- ^ InputIteratorTransformer (26) - +- ^ InputAdapter (25) - +- ^ ShuffleQueryStage (24), Statistics(X) - +- ColumnarExchange (23) - +- ^ RegularHashAggregateExecTransformer (21) - +- ^ InputIteratorTransformer (20) - +- ^ InputAdapter (19) - +- ^ ShuffleQueryStage (18), Statistics(X) - +- ColumnarExchange (17) + VeloxColumnarToRowExec (31) + +- ^ SortExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ InputIteratorTransformer (21) + +- ShuffleQueryStage (19), Statistics(X) + +- ColumnarExchange (18) + +- VeloxAppendBatches (17) +- ^ ProjectExecTransformer (15) +- ^ FlushableHashAggregateExecTransformer (14) +- ^ ProjectExecTransformer (13) - +- ^ BroadcastHashJoinExecTransformer LeftSemi (12) + +- ^ BroadcastHashJoinExecTransformer LeftSemi BuildRight (12) :- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) +- ^ InputIteratorTransformer (11) - +- ^ InputAdapter (10) - +- ^ BroadcastQueryStage (9), Statistics(X) - +- ColumnarBroadcastExchange (8) - +- ^ ProjectExecTransformer (6) - +- ^ FilterExecTransformer (5) - +- ^ Scan parquet (4) + +- BroadcastQueryStage (9), Statistics(X) + +- ColumnarBroadcastExchange (8) + +- ^ ProjectExecTransformer (6) + +- ^ FilterExecTransformer (5) + +- ^ Scan parquet (4) +- == Initial Plan == - Sort (43) - +- Exchange (42) - +- HashAggregate (41) - +- Exchange (40) - +- HashAggregate (39) - +- Project (38) - +- BroadcastHashJoin LeftSemi BuildRight (37) - :- Project (32) - : +- Filter (31) - : +- Scan parquet (30) - +- BroadcastExchange (36) - +- Project (35) - +- Filter (34) - +- Scan parquet (33) + Sort (45) + +- Exchange (44) + +- HashAggregate (43) + +- Exchange (42) + +- HashAggregate (41) + +- Project (40) + +- BroadcastHashJoin LeftSemi BuildRight (39) + :- Project (34) + : +- Filter (33) + : +- Scan parquet (32) + +- BroadcastExchange (38) + +- Project (37) + +- Filter (36) + +- Scan parquet (35) (1) Scan parquet @@ -116,126 +115,134 @@ Input [2]: [o_orderpriority#X, count#X] Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: false -(17) ColumnarExchange +(17) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] +Arguments: X + +(18) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] -(18) ShuffleQueryStage +(19) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] Arguments: X -(19) InputAdapter +(20) InputAdapter Input [2]: [o_orderpriority#X, count#X] -(20) InputIteratorTransformer +(21) InputIteratorTransformer Input [2]: [o_orderpriority#X, count#X] -(21) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(22) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(23) ColumnarExchange +(24) 
VeloxAppendBatches +Input [2]: [o_orderpriority#X, order_count#X] +Arguments: X + +(25) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(24) ShuffleQueryStage +(26) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] Arguments: X -(25) InputAdapter +(27) InputAdapter Input [2]: [o_orderpriority#X, order_count#X] -(26) InputIteratorTransformer +(28) InputIteratorTransformer Input [2]: [o_orderpriority#X, order_count#X] -(27) SortExecTransformer +(29) SortExecTransformer Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(28) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(29) VeloxColumnarToRowExec +(31) VeloxColumnarToRowExec Input [2]: [o_orderpriority#X, order_count#X] -(30) Scan parquet +(32) Scan parquet Output [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-07-01), LessThan(o_orderdate,1993-10-01)] ReadSchema: struct -(31) Filter +(33) Filter Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Condition : ((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-07-01)) AND (o_orderdate#X < 1993-10-01)) -(32) Project +(34) Project Output [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] -(33) Scan parquet +(35) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(34) Filter +(36) Filter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(35) Project +(37) Project Output [1]: [l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(36) BroadcastExchange +(38) BroadcastExchange Input [1]: [l_orderkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(37) BroadcastHashJoin +(39) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(38) Project +(40) Project Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(39) HashAggregate +(41) HashAggregate Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(40) Exchange +(42) Exchange Input [2]: [o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) HashAggregate +(43) HashAggregate Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(42) Exchange +(44) Exchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) Sort +(45) Sort Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(44) AdaptiveSparkPlan +(46) AdaptiveSparkPlan Output [2]: [o_orderpriority#X, order_count#X] Arguments: 
isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/5.txt index e811b63703667..afb9aa3699667 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/5.txt @@ -1,98 +1,93 @@ == Physical Plan == -AdaptiveSparkPlan (100) +AdaptiveSparkPlan (102) +- == Final Plan == - VeloxColumnarToRowExec (65) - +- ^ SortExecTransformer (63) - +- ^ InputIteratorTransformer (62) - +- ^ InputAdapter (61) - +- ^ ShuffleQueryStage (60), Statistics(X) - +- ColumnarExchange (59) - +- ^ RegularHashAggregateExecTransformer (57) - +- ^ InputIteratorTransformer (56) - +- ^ InputAdapter (55) - +- ^ ShuffleQueryStage (54), Statistics(X) - +- ColumnarExchange (53) + VeloxColumnarToRowExec (67) + +- ^ SortExecTransformer (65) + +- ^ InputIteratorTransformer (64) + +- ShuffleQueryStage (62), Statistics(X) + +- ColumnarExchange (61) + +- VeloxAppendBatches (60) + +- ^ RegularHashAggregateExecTransformer (58) + +- ^ InputIteratorTransformer (57) + +- ShuffleQueryStage (55), Statistics(X) + +- ColumnarExchange (54) + +- VeloxAppendBatches (53) +- ^ ProjectExecTransformer (51) +- ^ FlushableHashAggregateExecTransformer (50) +- ^ ProjectExecTransformer (49) - +- ^ BroadcastHashJoinExecTransformer Inner (48) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (48) :- ^ ProjectExecTransformer (39) - : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : :- ^ ProjectExecTransformer (30) - : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : :- ^ ProjectExecTransformer (21) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : :- ^ ProjectExecTransformer (12) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : :- ^ InputIteratorTransformer (7) - : : : : : +- ^ InputAdapter (6) - : : : : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (4) - : : : : : +- ^ FilterExecTransformer (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (5), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (4) + : : : : : +- ^ FilterExecTransformer (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ ProjectExecTransformer (10) : : : : +- ^ FilterExecTransformer (9) : : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (19) - : : : +- ^ InputAdapter (18) - : : : +- ^ BroadcastQueryStage (17), Statistics(X) - : : : +- ColumnarBroadcastExchange (16) - : : : +- ^ FilterExecTransformer (14) - : : : +- ^ Scan parquet (13) + : : : +- BroadcastQueryStage (17), Statistics(X) + : : : +- ColumnarBroadcastExchange (16) + : : : +- ^ FilterExecTransformer (14) + : : : +- ^ Scan parquet (13) : : +- ^ InputIteratorTransformer (28) - : : +- ^ InputAdapter (27) - : : +- ^ BroadcastQueryStage (26), Statistics(X) - : : +- ColumnarBroadcastExchange (25) - : : +- ^ FilterExecTransformer (23) - : : +- ^ Scan parquet (22) + : : +- BroadcastQueryStage (26), Statistics(X) + : : +- ColumnarBroadcastExchange (25) + : : +- ^ FilterExecTransformer (23) + : : +- ^ Scan parquet (22) : +- ^ InputIteratorTransformer (37) - : +- ^ InputAdapter (36) - : +- ^ 
BroadcastQueryStage (35), Statistics(X) - : +- ColumnarBroadcastExchange (34) - : +- ^ FilterExecTransformer (32) - : +- ^ Scan parquet (31) + : +- BroadcastQueryStage (35), Statistics(X) + : +- ColumnarBroadcastExchange (34) + : +- ^ FilterExecTransformer (32) + : +- ^ Scan parquet (31) +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ BroadcastQueryStage (45), Statistics(X) - +- ColumnarBroadcastExchange (44) - +- ^ ProjectExecTransformer (42) - +- ^ FilterExecTransformer (41) - +- ^ Scan parquet (40) + +- BroadcastQueryStage (45), Statistics(X) + +- ColumnarBroadcastExchange (44) + +- ^ ProjectExecTransformer (42) + +- ^ FilterExecTransformer (41) + +- ^ Scan parquet (40) +- == Initial Plan == - Sort (99) - +- Exchange (98) - +- HashAggregate (97) - +- Exchange (96) - +- HashAggregate (95) - +- Project (94) - +- BroadcastHashJoin Inner BuildRight (93) - :- Project (88) - : +- BroadcastHashJoin Inner BuildRight (87) - : :- Project (83) - : : +- BroadcastHashJoin Inner BuildRight (82) - : : :- Project (78) - : : : +- BroadcastHashJoin Inner BuildRight (77) - : : : :- Project (73) - : : : : +- BroadcastHashJoin Inner BuildLeft (72) - : : : : :- BroadcastExchange (68) - : : : : : +- Filter (67) - : : : : : +- Scan parquet (66) - : : : : +- Project (71) - : : : : +- Filter (70) - : : : : +- Scan parquet (69) - : : : +- BroadcastExchange (76) - : : : +- Filter (75) - : : : +- Scan parquet (74) - : : +- BroadcastExchange (81) - : : +- Filter (80) - : : +- Scan parquet (79) - : +- BroadcastExchange (86) - : +- Filter (85) - : +- Scan parquet (84) - +- BroadcastExchange (92) - +- Project (91) - +- Filter (90) - +- Scan parquet (89) + Sort (101) + +- Exchange (100) + +- HashAggregate (99) + +- Exchange (98) + +- HashAggregate (97) + +- Project (96) + +- BroadcastHashJoin Inner BuildRight (95) + :- Project (90) + : +- BroadcastHashJoin Inner BuildRight (89) + : :- Project (85) + : : +- BroadcastHashJoin Inner BuildRight (84) + : : :- Project (80) + : : : +- BroadcastHashJoin Inner BuildRight (79) + : : : :- Project (75) + : : : : +- BroadcastHashJoin Inner BuildLeft (74) + : : : : :- BroadcastExchange (70) + : : : : : +- Filter (69) + : : : : : +- Scan parquet (68) + : : : : +- Project (73) + : : : : +- Filter (72) + : : : : +- Scan parquet (71) + : : : +- BroadcastExchange (78) + : : : +- Filter (77) + : : : +- Scan parquet (76) + : : +- BroadcastExchange (83) + : : +- Filter (82) + : : +- Scan parquet (81) + : +- BroadcastExchange (88) + : +- Filter (87) + : +- Scan parquet (86) + +- BroadcastExchange (94) + +- Project (93) + +- Filter (92) + +- Scan parquet (91) (1) Scan parquet @@ -324,226 +319,234 @@ Input [3]: [n_name#X, sum#X, isEmpty#X] Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: false -(53) ColumnarExchange +(53) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] +Arguments: X + +(54) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(54) ShuffleQueryStage +(55) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] Arguments: X -(55) InputAdapter +(56) InputAdapter Input [3]: [n_name#X, sum#X, isEmpty#X] -(56) InputIteratorTransformer +(57) InputIteratorTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] -(57) RegularHashAggregateExecTransformer +(58) RegularHashAggregateExecTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: 
[sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [n_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(58) WholeStageCodegenTransformer (X) +(59) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(59) ColumnarExchange +(60) VeloxAppendBatches +Input [2]: [n_name#X, revenue#X] +Arguments: X + +(61) ColumnarExchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(60) ShuffleQueryStage +(62) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] Arguments: X -(61) InputAdapter +(63) InputAdapter Input [2]: [n_name#X, revenue#X] -(62) InputIteratorTransformer +(64) InputIteratorTransformer Input [2]: [n_name#X, revenue#X] -(63) SortExecTransformer +(65) SortExecTransformer Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(64) WholeStageCodegenTransformer (X) +(66) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(65) VeloxColumnarToRowExec +(67) VeloxColumnarToRowExec Input [2]: [n_name#X, revenue#X] -(66) Scan parquet +(68) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(67) Filter +(69) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(68) BroadcastExchange +(70) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(69) Scan parquet +(71) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(71) Project +(73) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(72) BroadcastHashJoin +(74) BroadcastHashJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(73) Project +(75) Project Output [2]: [c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(74) Scan parquet +(76) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(75) Filter +(77) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(76) BroadcastExchange +(78) BroadcastExchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(77) BroadcastHashJoin +(79) BroadcastHashJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(78) Project +(80) Project Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] 
Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(79) Scan parquet +(81) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(80) Filter +(82) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(81) BroadcastExchange +(83) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false], input[1, bigint, false]),false), [plan_id=X] -(82) BroadcastHashJoin +(84) BroadcastHashJoin Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join type: Inner Join condition: None -(83) Project +(85) Project Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(84) Scan parquet +(86) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(85) Filter +(87) Filter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(86) BroadcastExchange +(88) BroadcastExchange Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(87) BroadcastHashJoin +(89) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(88) Project +(90) Project Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(89) Scan parquet +(91) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(90) Filter +(92) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(91) Project +(93) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(92) BroadcastExchange +(94) BroadcastExchange Input [1]: [r_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(93) BroadcastHashJoin +(95) BroadcastHashJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join type: Inner Join condition: None -(94) Project +(96) Project Output [3]: [l_extendedprice#X, l_discount#X, n_name#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(95) HashAggregate +(97) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, n_name#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(96) Exchange +(98) Exchange Input [3]: [n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) HashAggregate +(99) HashAggregate Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [n_name#X, 
sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(98) Exchange +(100) Exchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Sort +(101) Sort Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(100) AdaptiveSparkPlan +(102) AdaptiveSparkPlan Output [2]: [n_name#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/6.txt index 0882ff9e151c6..ddc921e22d0f6 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/6.txt @@ -1,23 +1,23 @@ == Physical Plan == -AdaptiveSparkPlan (19) +AdaptiveSparkPlan (20) +- == Final Plan == - VeloxColumnarToRowExec (12) - +- ^ RegularHashAggregateExecTransformer (10) - +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ ShuffleQueryStage (7), Statistics(X) - +- ColumnarExchange (6) + VeloxColumnarToRowExec (13) + +- ^ RegularHashAggregateExecTransformer (11) + +- ^ InputIteratorTransformer (10) + +- ShuffleQueryStage (8), Statistics(X) + +- ColumnarExchange (7) + +- VeloxAppendBatches (6) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ FilterExecTransformer (2) +- ^ Scan parquet (1) +- == Initial Plan == - HashAggregate (18) - +- Exchange (17) - +- HashAggregate (16) - +- Project (15) - +- Filter (14) - +- Scan parquet (13) + HashAggregate (19) + +- Exchange (18) + +- HashAggregate (17) + +- Project (16) + +- Filter (15) + +- Scan parquet (14) (1) Scan parquet @@ -46,67 +46,71 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(6) ColumnarExchange +(6) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(7) ShuffleQueryStage +(8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(8) InputAdapter +(9) InputAdapter Input [2]: [sum#X, isEmpty#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(10) RegularHashAggregateExecTransformer +(11) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * l_discount#X))] Aggregate Attributes [1]: [sum((l_extendedprice#X * l_discount#X))#X] Results [1]: [sum((l_extendedprice#X * l_discount#X))#X AS revenue#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(12) VeloxColumnarToRowExec +(13) VeloxColumnarToRowExec Input [1]: [revenue#X] -(13) Scan parquet +(14) Scan parquet Output [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), IsNotNull(l_discount), IsNotNull(l_quantity), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), GreaterThanOrEqual(l_discount,0.05), LessThanOrEqual(l_discount,0.07), LessThan(l_quantity,24.00)] ReadSchema: struct -(14) Filter +(15) Filter Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((((((isnotnull(l_shipdate#X) AND isnotnull(l_discount#X)) AND isnotnull(l_quantity#X)) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) 
AND (l_discount#X >= 0.05)) AND (l_discount#X <= 0.07)) AND (l_quantity#X < 24.00)) -(15) Project +(16) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) HashAggregate +(17) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum((l_extendedprice#X * l_discount#X))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(17) Exchange +(18) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(18) HashAggregate +(19) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * l_discount#X))] Aggregate Attributes [1]: [sum((l_extendedprice#X * l_discount#X))#X] Results [1]: [sum((l_extendedprice#X * l_discount#X))#X AS revenue#X] -(19) AdaptiveSparkPlan +(20) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/7.txt index 129da90d3a2b2..5ca9bbe39ef2f 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/7.txt @@ -1,92 +1,87 @@ == Physical Plan == -AdaptiveSparkPlan (93) +AdaptiveSparkPlan (95) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- ^ SortExecTransformer (58) - +- ^ InputIteratorTransformer (57) - +- ^ InputAdapter (56) - +- ^ ShuffleQueryStage (55), Statistics(X) - +- ColumnarExchange (54) - +- ^ RegularHashAggregateExecTransformer (52) - +- ^ InputIteratorTransformer (51) - +- ^ InputAdapter (50) - +- ^ ShuffleQueryStage (49), Statistics(X) - +- ColumnarExchange (48) + VeloxColumnarToRowExec (62) + +- ^ SortExecTransformer (60) + +- ^ InputIteratorTransformer (59) + +- ShuffleQueryStage (57), Statistics(X) + +- ColumnarExchange (56) + +- VeloxAppendBatches (55) + +- ^ RegularHashAggregateExecTransformer (53) + +- ^ InputIteratorTransformer (52) + +- ShuffleQueryStage (50), Statistics(X) + +- ColumnarExchange (49) + +- VeloxAppendBatches (48) +- ^ ProjectExecTransformer (46) +- ^ FlushableHashAggregateExecTransformer (45) +- ^ ProjectExecTransformer (44) - +- ^ BroadcastHashJoinExecTransformer Inner (43) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (43) :- ^ ProjectExecTransformer (38) - : +- ^ BroadcastHashJoinExecTransformer Inner (37) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (37) : :- ^ ProjectExecTransformer (29) - : : +- ^ BroadcastHashJoinExecTransformer Inner (28) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (28) : : :- ^ ProjectExecTransformer (20) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (19) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (19) : : : :- ^ ProjectExecTransformer (11) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (10) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (10) : : : : :- ^ InputIteratorTransformer (7) - : : : : : +- ^ InputAdapter (6) - : : : : : +- ^ BroadcastQueryStage (5), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (4) - : : : : : +- ^ FilterExecTransformer (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (5), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (4) + : : : : : +- ^ FilterExecTransformer (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ FilterExecTransformer (9) 
: : : : +- ^ Scan parquet (8) : : : +- ^ InputIteratorTransformer (18) - : : : +- ^ InputAdapter (17) - : : : +- ^ BroadcastQueryStage (16), Statistics(X) - : : : +- ColumnarBroadcastExchange (15) - : : : +- ^ FilterExecTransformer (13) - : : : +- ^ Scan parquet (12) + : : : +- BroadcastQueryStage (16), Statistics(X) + : : : +- ColumnarBroadcastExchange (15) + : : : +- ^ FilterExecTransformer (13) + : : : +- ^ Scan parquet (12) : : +- ^ InputIteratorTransformer (27) - : : +- ^ InputAdapter (26) - : : +- ^ BroadcastQueryStage (25), Statistics(X) - : : +- ColumnarBroadcastExchange (24) - : : +- ^ FilterExecTransformer (22) - : : +- ^ Scan parquet (21) + : : +- BroadcastQueryStage (25), Statistics(X) + : : +- ColumnarBroadcastExchange (24) + : : +- ^ FilterExecTransformer (22) + : : +- ^ Scan parquet (21) : +- ^ InputIteratorTransformer (36) - : +- ^ InputAdapter (35) - : +- ^ BroadcastQueryStage (34), Statistics(X) - : +- ColumnarBroadcastExchange (33) - : +- ^ FilterExecTransformer (31) - : +- ^ Scan parquet (30) + : +- BroadcastQueryStage (34), Statistics(X) + : +- ColumnarBroadcastExchange (33) + : +- ^ FilterExecTransformer (31) + : +- ^ Scan parquet (30) +- ^ InputIteratorTransformer (42) - +- ^ InputAdapter (41) - +- ^ BroadcastQueryStage (40), Statistics(X) - +- ReusedExchange (39) + +- BroadcastQueryStage (40), Statistics(X) + +- ReusedExchange (39) +- == Initial Plan == - Sort (92) - +- Exchange (91) - +- HashAggregate (90) - +- Exchange (89) - +- HashAggregate (88) - +- Project (87) - +- BroadcastHashJoin Inner BuildRight (86) - :- Project (82) - : +- BroadcastHashJoin Inner BuildRight (81) - : :- Project (77) - : : +- BroadcastHashJoin Inner BuildRight (76) - : : :- Project (72) - : : : +- BroadcastHashJoin Inner BuildRight (71) - : : : :- Project (67) - : : : : +- BroadcastHashJoin Inner BuildLeft (66) - : : : : :- BroadcastExchange (63) - : : : : : +- Filter (62) - : : : : : +- Scan parquet (61) - : : : : +- Filter (65) - : : : : +- Scan parquet (64) - : : : +- BroadcastExchange (70) - : : : +- Filter (69) - : : : +- Scan parquet (68) - : : +- BroadcastExchange (75) - : : +- Filter (74) - : : +- Scan parquet (73) - : +- BroadcastExchange (80) - : +- Filter (79) - : +- Scan parquet (78) - +- BroadcastExchange (85) - +- Filter (84) - +- Scan parquet (83) + Sort (94) + +- Exchange (93) + +- HashAggregate (92) + +- Exchange (91) + +- HashAggregate (90) + +- Project (89) + +- BroadcastHashJoin Inner BuildRight (88) + :- Project (84) + : +- BroadcastHashJoin Inner BuildRight (83) + : :- Project (79) + : : +- BroadcastHashJoin Inner BuildRight (78) + : : :- Project (74) + : : : +- BroadcastHashJoin Inner BuildRight (73) + : : : :- Project (69) + : : : : +- BroadcastHashJoin Inner BuildLeft (68) + : : : : :- BroadcastExchange (65) + : : : : : +- Filter (64) + : : : : : +- Scan parquet (63) + : : : : +- Filter (67) + : : : : +- Scan parquet (66) + : : : +- BroadcastExchange (72) + : : : +- Filter (71) + : : : +- Scan parquet (70) + : : +- BroadcastExchange (77) + : : +- Filter (76) + : : +- Scan parquet (75) + : +- BroadcastExchange (82) + : +- Filter (81) + : +- Scan parquet (80) + +- BroadcastExchange (87) + +- Filter (86) + +- Scan parquet (85) (1) Scan parquet @@ -294,218 +289,226 @@ Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: false -(48) ColumnarExchange +(48) VeloxAppendBatches +Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, 
sum#X, isEmpty#X] +Arguments: X + +(49) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(49) ShuffleQueryStage +(50) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: X -(50) InputAdapter +(51) InputAdapter Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(51) InputIteratorTransformer +(52) InputIteratorTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(52) RegularHashAggregateExecTransformer +(53) RegularHashAggregateExecTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(53) WholeStageCodegenTransformer (X) +(54) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(54) ColumnarExchange +(55) VeloxAppendBatches +Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] +Arguments: X + +(56) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(55) ShuffleQueryStage +(57) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: X -(56) InputAdapter +(58) InputAdapter Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(57) InputIteratorTransformer +(59) InputIteratorTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(58) SortExecTransformer +(60) SortExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(59) WholeStageCodegenTransformer (X) +(61) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(60) VeloxColumnarToRowExec +(62) VeloxColumnarToRowExec Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(61) Scan parquet +(63) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(62) Filter +(64) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(63) BroadcastExchange +(65) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(64) Scan parquet +(66) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(65) Filter +(67) Filter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND 
isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(66) BroadcastHashJoin +(68) BroadcastHashJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join type: Inner Join condition: None -(67) Project +(69) Project Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(68) Scan parquet +(70) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(69) Filter +(71) Filter Input [2]: [o_orderkey#X, o_custkey#X] Condition : (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(70) BroadcastExchange +(72) BroadcastExchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(71) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(72) Project +(74) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(73) Scan parquet +(75) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(74) Filter +(76) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(75) BroadcastExchange +(77) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(76) BroadcastHashJoin +(78) BroadcastHashJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join type: Inner Join condition: None -(77) Project +(79) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(78) Scan parquet +(80) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(79) Filter +(81) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(80) BroadcastExchange +(82) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(81) BroadcastHashJoin +(83) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(82) Project +(84) Project Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(83) Scan parquet +(85) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,GERMANY),EqualTo(n_name,FRANCE))] ReadSchema: struct -(84) Filter +(86) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = GERMANY) OR (n_name#X = FRANCE))) -(85) BroadcastExchange 
+(87) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(87) Project +(89) Project Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, (l_extendedprice#X * (1 - l_discount#X)) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(88) HashAggregate +(90) HashAggregate Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(89) Exchange +(91) Exchange Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) HashAggregate +(92) HashAggregate Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(91) Exchange +(93) Exchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(92) Sort +(94) Sort Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(93) AdaptiveSparkPlan +(95) AdaptiveSparkPlan Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/8.txt index 4ac782958199d..6c2c8eb46bc97 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/8.txt @@ -1,125 +1,118 @@ == Physical Plan == -AdaptiveSparkPlan (129) +AdaptiveSparkPlan (131) +- == Final Plan == - VeloxColumnarToRowExec (84) - +- ^ SortExecTransformer (82) - +- ^ InputIteratorTransformer (81) - +- ^ InputAdapter (80) - +- ^ ShuffleQueryStage (79), Statistics(X) - +- ColumnarExchange (78) - +- ^ ProjectExecTransformer (76) - +- ^ RegularHashAggregateExecTransformer (75) - +- ^ InputIteratorTransformer (74) - +- ^ InputAdapter (73) - +- ^ ShuffleQueryStage (72), Statistics(X) - +- ColumnarExchange (71) + VeloxColumnarToRowExec (86) + +- ^ SortExecTransformer (84) + +- ^ InputIteratorTransformer (83) + +- ShuffleQueryStage (81), Statistics(X) + +- ColumnarExchange (80) + +- VeloxAppendBatches (79) + +- ^ ProjectExecTransformer (77) + +- ^ RegularHashAggregateExecTransformer (76) + +- ^ InputIteratorTransformer (75) + +- ShuffleQueryStage (73), Statistics(X) + +- ColumnarExchange (72) + +- VeloxAppendBatches (71) +- ^ ProjectExecTransformer (69) +- ^ FlushableHashAggregateExecTransformer (68) +- ^ ProjectExecTransformer (67) - +- ^ 
BroadcastHashJoinExecTransformer Inner (66) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (66) :- ^ ProjectExecTransformer (57) - : +- ^ BroadcastHashJoinExecTransformer Inner (56) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (56) : :- ^ ProjectExecTransformer (48) - : : +- ^ BroadcastHashJoinExecTransformer Inner (47) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) : : :- ^ ProjectExecTransformer (39) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : : : :- ^ ProjectExecTransformer (30) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : : : :- ^ ProjectExecTransformer (21) - : : : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : : : :- ^ ProjectExecTransformer (12) - : : : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : : : :- ^ InputIteratorTransformer (8) - : : : : : : : +- ^ InputAdapter (7) - : : : : : : : +- ^ BroadcastQueryStage (6), Statistics(X) - : : : : : : : +- ColumnarBroadcastExchange (5) - : : : : : : : +- ^ ProjectExecTransformer (3) - : : : : : : : +- ^ FilterExecTransformer (2) - : : : : : : : +- ^ Scan parquet (1) + : : : : : : : +- BroadcastQueryStage (6), Statistics(X) + : : : : : : : +- ColumnarBroadcastExchange (5) + : : : : : : : +- ^ ProjectExecTransformer (3) + : : : : : : : +- ^ FilterExecTransformer (2) + : : : : : : : +- ^ Scan parquet (1) : : : : : : +- ^ FilterExecTransformer (10) : : : : : : +- ^ Scan parquet (9) : : : : : +- ^ InputIteratorTransformer (19) - : : : : : +- ^ InputAdapter (18) - : : : : : +- ^ BroadcastQueryStage (17), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (16) - : : : : : +- ^ FilterExecTransformer (14) - : : : : : +- ^ Scan parquet (13) + : : : : : +- BroadcastQueryStage (17), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (16) + : : : : : +- ^ FilterExecTransformer (14) + : : : : : +- ^ Scan parquet (13) : : : : +- ^ InputIteratorTransformer (28) - : : : : +- ^ InputAdapter (27) - : : : : +- ^ BroadcastQueryStage (26), Statistics(X) - : : : : +- ColumnarBroadcastExchange (25) - : : : : +- ^ FilterExecTransformer (23) - : : : : +- ^ Scan parquet (22) + : : : : +- BroadcastQueryStage (26), Statistics(X) + : : : : +- ColumnarBroadcastExchange (25) + : : : : +- ^ FilterExecTransformer (23) + : : : : +- ^ Scan parquet (22) : : : +- ^ InputIteratorTransformer (37) - : : : +- ^ InputAdapter (36) - : : : +- ^ BroadcastQueryStage (35), Statistics(X) - : : : +- ColumnarBroadcastExchange (34) - : : : +- ^ FilterExecTransformer (32) - : : : +- ^ Scan parquet (31) + : : : +- BroadcastQueryStage (35), Statistics(X) + : : : +- ColumnarBroadcastExchange (34) + : : : +- ^ FilterExecTransformer (32) + : : : +- ^ Scan parquet (31) : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ BroadcastQueryStage (44), Statistics(X) - : : +- ColumnarBroadcastExchange (43) - : : +- ^ FilterExecTransformer (41) - : : +- ^ Scan parquet (40) + : : +- BroadcastQueryStage (44), Statistics(X) + : : +- ColumnarBroadcastExchange (43) + : : +- ^ FilterExecTransformer (41) + : : +- ^ Scan parquet (40) : +- ^ InputIteratorTransformer (55) - : +- ^ InputAdapter (54) - : +- ^ BroadcastQueryStage (53), Statistics(X) - : +- ColumnarBroadcastExchange (52) - : +- ^ 
FilterExecTransformer (50) - : +- ^ Scan parquet (49) + : +- BroadcastQueryStage (53), Statistics(X) + : +- ColumnarBroadcastExchange (52) + : +- ^ FilterExecTransformer (50) + : +- ^ Scan parquet (49) +- ^ InputIteratorTransformer (65) - +- ^ InputAdapter (64) - +- ^ BroadcastQueryStage (63), Statistics(X) - +- ColumnarBroadcastExchange (62) - +- ^ ProjectExecTransformer (60) - +- ^ FilterExecTransformer (59) - +- ^ Scan parquet (58) + +- BroadcastQueryStage (63), Statistics(X) + +- ColumnarBroadcastExchange (62) + +- ^ ProjectExecTransformer (60) + +- ^ FilterExecTransformer (59) + +- ^ Scan parquet (58) +- == Initial Plan == - Sort (128) - +- Exchange (127) - +- HashAggregate (126) - +- Exchange (125) - +- HashAggregate (124) - +- Project (123) - +- BroadcastHashJoin Inner BuildRight (122) - :- Project (117) - : +- BroadcastHashJoin Inner BuildRight (116) - : :- Project (112) - : : +- BroadcastHashJoin Inner BuildRight (111) - : : :- Project (107) - : : : +- BroadcastHashJoin Inner BuildRight (106) - : : : :- Project (102) - : : : : +- BroadcastHashJoin Inner BuildRight (101) - : : : : :- Project (97) - : : : : : +- BroadcastHashJoin Inner BuildRight (96) - : : : : : :- Project (92) - : : : : : : +- BroadcastHashJoin Inner BuildLeft (91) - : : : : : : :- BroadcastExchange (88) - : : : : : : : +- Project (87) - : : : : : : : +- Filter (86) - : : : : : : : +- Scan parquet (85) - : : : : : : +- Filter (90) - : : : : : : +- Scan parquet (89) - : : : : : +- BroadcastExchange (95) - : : : : : +- Filter (94) - : : : : : +- Scan parquet (93) - : : : : +- BroadcastExchange (100) - : : : : +- Filter (99) - : : : : +- Scan parquet (98) - : : : +- BroadcastExchange (105) - : : : +- Filter (104) - : : : +- Scan parquet (103) - : : +- BroadcastExchange (110) - : : +- Filter (109) - : : +- Scan parquet (108) - : +- BroadcastExchange (115) - : +- Filter (114) - : +- Scan parquet (113) - +- BroadcastExchange (121) - +- Project (120) - +- Filter (119) - +- Scan parquet (118) + Sort (130) + +- Exchange (129) + +- HashAggregate (128) + +- Exchange (127) + +- HashAggregate (126) + +- Project (125) + +- BroadcastHashJoin Inner BuildRight (124) + :- Project (119) + : +- BroadcastHashJoin Inner BuildRight (118) + : :- Project (114) + : : +- BroadcastHashJoin Inner BuildRight (113) + : : :- Project (109) + : : : +- BroadcastHashJoin Inner BuildRight (108) + : : : :- Project (104) + : : : : +- BroadcastHashJoin Inner BuildRight (103) + : : : : :- Project (99) + : : : : : +- BroadcastHashJoin Inner BuildRight (98) + : : : : : :- Project (94) + : : : : : : +- BroadcastHashJoin Inner BuildLeft (93) + : : : : : : :- BroadcastExchange (90) + : : : : : : : +- Project (89) + : : : : : : : +- Filter (88) + : : : : : : : +- Scan parquet (87) + : : : : : : +- Filter (92) + : : : : : : +- Scan parquet (91) + : : : : : +- BroadcastExchange (97) + : : : : : +- Filter (96) + : : : : : +- Scan parquet (95) + : : : : +- BroadcastExchange (102) + : : : : +- Filter (101) + : : : : +- Scan parquet (100) + : : : +- BroadcastExchange (107) + : : : +- Filter (106) + : : : +- Scan parquet (105) + : : +- BroadcastExchange (112) + : : +- Filter (111) + : : +- Scan parquet (110) + : +- BroadcastExchange (117) + : +- Filter (116) + : +- Scan parquet (115) + +- BroadcastExchange (123) + +- Project (122) + +- Filter (121) + +- Scan parquet (120) (1) Scan parquet @@ -429,280 +422,288 @@ Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false 
-(71) ColumnarExchange +(71) VeloxAppendBatches +Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(72) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(72) ShuffleQueryStage +(73) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(73) InputAdapter +(74) InputAdapter Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(74) InputIteratorTransformer +(75) InputIteratorTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(75) RegularHashAggregateExecTransformer +(76) RegularHashAggregateExecTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(76) ProjectExecTransformer +(77) ProjectExecTransformer Output [2]: [o_year#X, (sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X / sum(volume#X)#X) AS mkt_share#X] Input [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(77) WholeStageCodegenTransformer (X) +(78) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(78) ColumnarExchange +(79) VeloxAppendBatches +Input [2]: [o_year#X, mkt_share#X] +Arguments: X + +(80) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(79) ShuffleQueryStage +(81) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] Arguments: X -(80) InputAdapter +(82) InputAdapter Input [2]: [o_year#X, mkt_share#X] -(81) InputIteratorTransformer +(83) InputIteratorTransformer Input [2]: [o_year#X, mkt_share#X] -(82) SortExecTransformer +(84) SortExecTransformer Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(83) WholeStageCodegenTransformer (X) +(85) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(84) VeloxColumnarToRowExec +(86) VeloxColumnarToRowExec Input [2]: [o_year#X, mkt_share#X] -(85) Scan parquet +(87) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_type), EqualTo(p_type,ECONOMY ANODIZED STEEL), IsNotNull(p_partkey)] ReadSchema: struct -(86) Filter +(88) Filter Input [2]: [p_partkey#X, p_type#X] Condition : ((isnotnull(p_type#X) AND (p_type#X = ECONOMY ANODIZED STEEL)) AND isnotnull(p_partkey#X)) -(87) Project +(89) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_type#X] -(88) BroadcastExchange +(90) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(89) Scan parquet +(91) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(90) Filter +(92) Filter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : 
((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(91) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: None -(92) Project +(94) Project Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(93) Scan parquet +(95) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(94) Filter +(96) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(95) BroadcastExchange +(97) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(96) BroadcastHashJoin +(98) BroadcastHashJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(97) Project +(99) Project Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(98) Scan parquet +(100) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(99) Filter +(101) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(100) BroadcastExchange +(102) BroadcastExchange Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(101) BroadcastHashJoin +(103) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(102) Project +(104) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(103) Scan parquet +(105) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(104) Filter +(106) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(105) BroadcastExchange +(107) BroadcastExchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(106) BroadcastHashJoin +(108) BroadcastHashJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join type: Inner Join condition: None -(107) Project +(109) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(108) Scan parquet +(110) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: 
struct -(109) Filter +(111) Filter Input [2]: [n_nationkey#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(110) BroadcastExchange +(112) BroadcastExchange Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(111) BroadcastHashJoin +(113) BroadcastHashJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(112) Project +(114) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(113) Scan parquet +(115) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(114) Filter +(116) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(115) BroadcastExchange +(117) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(116) BroadcastHashJoin +(118) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(117) Project +(119) Project Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(118) Scan parquet +(120) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(119) Filter +(121) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(120) Project +(122) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(121) BroadcastExchange +(123) BroadcastExchange Input [1]: [r_regionkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(122) BroadcastHashJoin +(124) BroadcastHashJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join type: Inner Join condition: None -(123) Project +(125) Project Output [3]: [year(o_orderdate#X) AS o_year#X, (l_extendedprice#X * (1 - l_discount#X)) AS volume#X, n_name#X AS nation#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(124) HashAggregate +(126) HashAggregate Input [3]: [o_year#X, volume#X, nation#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(125) Exchange +(127) Exchange Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(126) HashAggregate +(128) HashAggregate Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [2]: [o_year#X, (sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X / sum(volume#X)#X) 
AS mkt_share#X] -(127) Exchange +(129) Exchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(128) Sort +(130) Sort Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(129) AdaptiveSparkPlan +(131) AdaptiveSparkPlan Output [2]: [o_year#X, mkt_share#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/9.txt index 1a41ac753c5c0..6a4faab249c87 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/9.txt @@ -1,96 +1,91 @@ == Physical Plan == -AdaptiveSparkPlan (98) +AdaptiveSparkPlan (100) +- == Final Plan == - VeloxColumnarToRowExec (64) - +- ^ SortExecTransformer (62) - +- ^ InputIteratorTransformer (61) - +- ^ InputAdapter (60) - +- ^ ShuffleQueryStage (59), Statistics(X) - +- ColumnarExchange (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) + VeloxColumnarToRowExec (66) + +- ^ SortExecTransformer (64) + +- ^ InputIteratorTransformer (63) + +- ShuffleQueryStage (61), Statistics(X) + +- ColumnarExchange (60) + +- VeloxAppendBatches (59) + +- ^ RegularHashAggregateExecTransformer (57) + +- ^ InputIteratorTransformer (56) + +- ShuffleQueryStage (54), Statistics(X) + +- ColumnarExchange (53) + +- VeloxAppendBatches (52) +- ^ ProjectExecTransformer (50) +- ^ FlushableHashAggregateExecTransformer (49) +- ^ ProjectExecTransformer (48) - +- ^ BroadcastHashJoinExecTransformer Inner (47) + +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (47) :- ^ ProjectExecTransformer (39) - : +- ^ BroadcastHashJoinExecTransformer Inner (38) + : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (38) : :- ^ ProjectExecTransformer (30) - : : +- ^ BroadcastHashJoinExecTransformer Inner (29) + : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (29) : : :- ^ ProjectExecTransformer (21) - : : : +- ^ BroadcastHashJoinExecTransformer Inner (20) + : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildRight (20) : : : :- ^ ProjectExecTransformer (12) - : : : : +- ^ BroadcastHashJoinExecTransformer Inner (11) + : : : : +- ^ BroadcastHashJoinExecTransformer Inner BuildLeft (11) : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ BroadcastQueryStage (6), Statistics(X) - : : : : : +- ColumnarBroadcastExchange (5) - : : : : : +- ^ ProjectExecTransformer (3) - : : : : : +- ^ FilterExecTransformer (2) - : : : : : +- ^ Scan parquet (1) + : : : : : +- BroadcastQueryStage (6), Statistics(X) + : : : : : +- ColumnarBroadcastExchange (5) + : : : : : +- ^ ProjectExecTransformer (3) + : : : : : +- ^ FilterExecTransformer (2) + : : : : : +- ^ Scan parquet (1) : : : : +- ^ FilterExecTransformer (10) : : : : +- ^ Scan parquet (9) : : : +- ^ InputIteratorTransformer (19) - : : : +- ^ InputAdapter (18) - : : : +- ^ BroadcastQueryStage (17), Statistics(X) - : : : +- ColumnarBroadcastExchange (16) - : : : +- ^ FilterExecTransformer (14) - : : : +- ^ Scan parquet (13) + : : : +- BroadcastQueryStage (17), Statistics(X) + : : : +- ColumnarBroadcastExchange (16) + : : : +- ^ FilterExecTransformer (14) + : : : +- ^ Scan parquet (13) : : +- ^ InputIteratorTransformer (28) - : : +- 
^ InputAdapter (27) - : : +- ^ BroadcastQueryStage (26), Statistics(X) - : : +- ColumnarBroadcastExchange (25) - : : +- ^ FilterExecTransformer (23) - : : +- ^ Scan parquet (22) + : : +- BroadcastQueryStage (26), Statistics(X) + : : +- ColumnarBroadcastExchange (25) + : : +- ^ FilterExecTransformer (23) + : : +- ^ Scan parquet (22) : +- ^ InputIteratorTransformer (37) - : +- ^ InputAdapter (36) - : +- ^ BroadcastQueryStage (35), Statistics(X) - : +- ColumnarBroadcastExchange (34) - : +- ^ FilterExecTransformer (32) - : +- ^ Scan parquet (31) + : +- BroadcastQueryStage (35), Statistics(X) + : +- ColumnarBroadcastExchange (34) + : +- ^ FilterExecTransformer (32) + : +- ^ Scan parquet (31) +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ BroadcastQueryStage (44), Statistics(X) - +- ColumnarBroadcastExchange (43) - +- ^ FilterExecTransformer (41) - +- ^ Scan parquet (40) + +- BroadcastQueryStage (44), Statistics(X) + +- ColumnarBroadcastExchange (43) + +- ^ FilterExecTransformer (41) + +- ^ Scan parquet (40) +- == Initial Plan == - Sort (97) - +- Exchange (96) - +- HashAggregate (95) - +- Exchange (94) - +- HashAggregate (93) - +- Project (92) - +- BroadcastHashJoin Inner BuildRight (91) - :- Project (87) - : +- BroadcastHashJoin Inner BuildRight (86) - : :- Project (82) - : : +- BroadcastHashJoin Inner BuildRight (81) - : : :- Project (77) - : : : +- BroadcastHashJoin Inner BuildRight (76) - : : : :- Project (72) - : : : : +- BroadcastHashJoin Inner BuildLeft (71) - : : : : :- BroadcastExchange (68) - : : : : : +- Project (67) - : : : : : +- Filter (66) - : : : : : +- Scan parquet (65) - : : : : +- Filter (70) - : : : : +- Scan parquet (69) - : : : +- BroadcastExchange (75) - : : : +- Filter (74) - : : : +- Scan parquet (73) - : : +- BroadcastExchange (80) - : : +- Filter (79) - : : +- Scan parquet (78) - : +- BroadcastExchange (85) - : +- Filter (84) - : +- Scan parquet (83) - +- BroadcastExchange (90) - +- Filter (89) - +- Scan parquet (88) + Sort (99) + +- Exchange (98) + +- HashAggregate (97) + +- Exchange (96) + +- HashAggregate (95) + +- Project (94) + +- BroadcastHashJoin Inner BuildRight (93) + :- Project (89) + : +- BroadcastHashJoin Inner BuildRight (88) + : :- Project (84) + : : +- BroadcastHashJoin Inner BuildRight (83) + : : :- Project (79) + : : : +- BroadcastHashJoin Inner BuildRight (78) + : : : :- Project (74) + : : : : +- BroadcastHashJoin Inner BuildLeft (73) + : : : : :- BroadcastExchange (70) + : : : : : +- Project (69) + : : : : : +- Filter (68) + : : : : : +- Scan parquet (67) + : : : : +- Filter (72) + : : : : +- Scan parquet (71) + : : : +- BroadcastExchange (77) + : : : +- Filter (76) + : : : +- Scan parquet (75) + : : +- BroadcastExchange (82) + : : +- Filter (81) + : : +- Scan parquet (80) + : +- BroadcastExchange (87) + : +- Filter (86) + : +- Scan parquet (85) + +- BroadcastExchange (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet @@ -318,222 +313,230 @@ Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: false -(52) ColumnarExchange +(52) VeloxAppendBatches +Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] +Arguments: X + +(53) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(54) ShuffleQueryStage Output [4]: [nation#X, o_year#X, 
sum#X, isEmpty#X] Arguments: X -(54) InputAdapter +(55) InputAdapter Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(55) InputIteratorTransformer +(56) InputIteratorTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(56) RegularHashAggregateExecTransformer +(57) RegularHashAggregateExecTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(57) WholeStageCodegenTransformer (X) +(58) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(58) ColumnarExchange +(59) VeloxAppendBatches +Input [3]: [nation#X, o_year#X, sum_profit#X] +Arguments: X + +(60) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(61) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: X -(60) InputAdapter +(62) InputAdapter Input [3]: [nation#X, o_year#X, sum_profit#X] -(61) InputIteratorTransformer +(63) InputIteratorTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] -(62) SortExecTransformer +(64) SortExecTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(63) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(64) VeloxColumnarToRowExec +(66) VeloxColumnarToRowExec Input [3]: [nation#X, o_year#X, sum_profit#X] -(65) Scan parquet +(67) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringContains(p_name,green), IsNotNull(p_partkey)] ReadSchema: struct -(66) Filter +(68) Filter Input [2]: [p_partkey#X, p_name#X] Condition : ((isnotnull(p_name#X) AND Contains(p_name#X, green)) AND isnotnull(p_partkey#X)) -(67) Project +(69) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(68) BroadcastExchange +(70) BroadcastExchange Input [1]: [p_partkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=X] -(69) Scan parquet +(71) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(70) Filter +(72) Filter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(71) BroadcastHashJoin +(73) BroadcastHashJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: None -(72) Project +(74) Project Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(73) Scan parquet +(75) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(74) Filter +(76) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND 
isnotnull(s_nationkey#X)) -(75) BroadcastExchange +(77) BroadcastExchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(76) BroadcastHashJoin +(78) BroadcastHashJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(77) Project +(79) Project Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(78) Scan parquet +(80) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(79) Filter +(81) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Condition : (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(80) BroadcastExchange +(82) BroadcastExchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: HashedRelationBroadcastMode(List(input[1, bigint, false], input[0, bigint, false]),false), [plan_id=X] -(81) BroadcastHashJoin +(83) BroadcastHashJoin Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join type: Inner Join condition: None -(82) Project +(84) Project Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(83) Scan parquet +(85) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(84) Filter +(86) Filter Input [2]: [o_orderkey#X, o_orderdate#X] Condition : isnotnull(o_orderkey#X) -(85) BroadcastExchange +(87) BroadcastExchange Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(86) BroadcastHashJoin +(88) BroadcastHashJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(87) Project +(89) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(88) Scan parquet +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(89) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(90) BroadcastExchange +(92) BroadcastExchange Input [2]: [n_nationkey#X, n_name#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=X] -(91) BroadcastHashJoin +(93) BroadcastHashJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(92) Project +(94) Project Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, ((l_extendedprice#X * (1 - l_discount#X)) - (ps_supplycost#X * l_quantity#X)) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(93) HashAggregate +(95) HashAggregate Input [3]: [nation#X, o_year#X, 
amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(94) Exchange +(96) Exchange Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(95) HashAggregate +(97) HashAggregate Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(96) Exchange +(98) Exchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) Sort +(99) Sort Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(98) AdaptiveSparkPlan +(100) AdaptiveSparkPlan Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/1.txt index 8d67aad16c3dc..53edb933c1fb4 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/1.txt @@ -1,31 +1,31 @@ == Physical Plan == -AdaptiveSparkPlan (28) +AdaptiveSparkPlan (30) +- == Final Plan == - VeloxColumnarToRowExec (19) - +- ^ SortExecTransformer (17) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (21) + +- ^ SortExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ ProjectExecTransformer (5) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ NoopFilter (2) +- ^ Scan parquet (1) +- == Initial Plan == - Sort (27) - +- Exchange (26) - +- HashAggregate (25) - +- Exchange (24) - +- HashAggregate (23) - +- Project (22) - +- Filter (21) - +- Scan parquet (20) + Sort (29) + +- Exchange (28) + +- HashAggregate (27) + +- Exchange (26) + +- HashAggregate (25) + +- Project (24) + +- Filter (23) + +- Scan parquet (22) (1) Scan parquet @@ -58,97 +58,105 @@ Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] +Arguments: X + +(8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] 
Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true)), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * 
promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] +Arguments: X + +(15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(17) SortExecTransformer +(19) SortExecTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(18) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(19) VeloxColumnarToRowExec +(21) VeloxColumnarToRowExec Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(20) Scan parquet +(22) Scan parquet Output [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), LessThanOrEqual(l_shipdate,1998-09-02)] ReadSchema: struct -(21) Filter +(23) Filter Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Condition : (isnotnull(l_shipdate#X) AND (l_shipdate#X <= 1998-09-02)) -(22) Project +(24) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] -(23) HashAggregate +(25) HashAggregate Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [partial_sum(l_quantity#X), partial_sum(l_extendedprice#X), 
partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)), partial_sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true)), partial_avg(l_quantity#X), partial_avg(l_extendedprice#X), partial_avg(l_discount#X), partial_count(1)] Aggregate Attributes [15]: [sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Results [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(24) Exchange +(26) Exchange Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(25) HashAggregate +(27) HashAggregate Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true)), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * 
promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(26) Exchange +(28) Exchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Sort +(29) Sort Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(28) AdaptiveSparkPlan +(30) AdaptiveSparkPlan Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/10.txt index 9af231ec1bb81..ec46bfd07b911 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/10.txt @@ -1,85 +1,91 @@ == Physical Plan == -AdaptiveSparkPlan (87) +AdaptiveSparkPlan (100) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- TakeOrderedAndProjectExecTransformer (59) - +- ^ ProjectExecTransformer (57) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53) - +- ColumnarExchange (52) - +- ^ ProjectExecTransformer (50) - +- ^ FlushableHashAggregateExecTransformer (49) - +- ^ ProjectExecTransformer (48) - +- ^ ShuffledHashJoinExecTransformer Inner (47) - :- ^ InputIteratorTransformer (38) - : +- ^ InputAdapter (37) - : +- ^ ShuffleQueryStage (36) - : +- ColumnarExchange (35) - : +- ^ ProjectExecTransformer (33) - : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : :- ^ InputIteratorTransformer (23) - : : +- ^ InputAdapter (22) - : : +- ^ ShuffleQueryStage (21) - : : +- ColumnarExchange (20) - : : +- ^ ProjectExecTransformer (18) - : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : :- ^ InputIteratorTransformer (8) - : : : +- ^ InputAdapter (7) - : : : +- ^ ShuffleQueryStage (6) - : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (67) + +- TakeOrderedAndProjectExecTransformer (66) + +- ^ ProjectExecTransformer (64) + +- ^ RegularHashAggregateExecTransformer (63) + +- ^ InputIteratorTransformer (62) + +- ShuffleQueryStage (60) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ ProjectExecTransformer (56) + +- ^ FlushableHashAggregateExecTransformer (55) + +- ^ ProjectExecTransformer (54) + +- ^ ShuffledHashJoinExecTransformer 
Inner BuildRight (53) + :- ^ InputIteratorTransformer (43) + : +- ShuffleQueryStage (41) + : +- ColumnarExchange (40) + : +- VeloxAppendBatches (39) + : +- ^ ProjectExecTransformer (37) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + : :- ^ InputIteratorTransformer (26) + : : +- ShuffleQueryStage (24) + : : +- ColumnarExchange (23) + : : +- VeloxAppendBatches (22) + : : +- ^ ProjectExecTransformer (20) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : : :- ^ InputIteratorTransformer (9) + : : : +- ShuffleQueryStage (7) + : : : +- ColumnarExchange (6) + : : : +- VeloxAppendBatches (5) : : : +- ^ ProjectExecTransformer (3) : : : +- ^ NoopFilter (2) : : : +- ^ Scan parquet (1) - : : +- ^ InputIteratorTransformer (16) - : : +- ^ InputAdapter (15) - : : +- ^ ShuffleQueryStage (14) - : : +- ColumnarExchange (13) - : : +- ^ ProjectExecTransformer (11) - : : +- ^ NoopFilter (10) - : : +- ^ Scan parquet (9) - : +- ^ InputIteratorTransformer (31) - : +- ^ InputAdapter (30) - : +- ^ ShuffleQueryStage (29) - : +- ColumnarExchange (28) - : +- ^ ProjectExecTransformer (26) - : +- ^ NoopFilter (25) - : +- ^ Scan parquet (24) - +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ ShuffleQueryStage (44) - +- ColumnarExchange (43) - +- ^ ProjectExecTransformer (41) - +- ^ NoopFilter (40) - +- ^ Scan parquet (39) + : : +- ^ InputIteratorTransformer (18) + : : +- ShuffleQueryStage (16) + : : +- ColumnarExchange (15) + : : +- VeloxAppendBatches (14) + : : +- ^ ProjectExecTransformer (12) + : : +- ^ NoopFilter (11) + : : +- ^ Scan parquet (10) + : +- ^ InputIteratorTransformer (35) + : +- ShuffleQueryStage (33) + : +- ColumnarExchange (32) + : +- VeloxAppendBatches (31) + : +- ^ ProjectExecTransformer (29) + : +- ^ NoopFilter (28) + : +- ^ Scan parquet (27) + +- ^ InputIteratorTransformer (52) + +- ShuffleQueryStage (50) + +- ColumnarExchange (49) + +- VeloxAppendBatches (48) + +- ^ ProjectExecTransformer (46) + +- ^ NoopFilter (45) + +- ^ Scan parquet (44) +- == Initial Plan == - TakeOrderedAndProject (86) - +- HashAggregate (85) - +- Exchange (84) - +- HashAggregate (83) - +- Project (82) - +- ShuffledHashJoin Inner BuildRight (81) - :- Exchange (77) - : +- Project (76) - : +- ShuffledHashJoin Inner BuildRight (75) - : :- Exchange (70) - : : +- Project (69) - : : +- ShuffledHashJoin Inner BuildRight (68) - : : :- Exchange (63) - : : : +- Filter (62) - : : : +- Scan parquet (61) - : : +- Exchange (67) - : : +- Project (66) - : : +- Filter (65) - : : +- Scan parquet (64) - : +- Exchange (74) - : +- Project (73) - : +- Filter (72) - : +- Scan parquet (71) - +- Exchange (80) - +- Filter (79) - +- Scan parquet (78) + TakeOrderedAndProject (99) + +- HashAggregate (98) + +- Exchange (97) + +- HashAggregate (96) + +- Project (95) + +- SortMergeJoin Inner (94) + :- Sort (89) + : +- Exchange (88) + : +- Project (87) + : +- SortMergeJoin Inner (86) + : :- Sort (80) + : : +- Exchange (79) + : : +- Project (78) + : : +- SortMergeJoin Inner (77) + : : :- Sort (71) + : : : +- Exchange (70) + : : : +- Filter (69) + : : : +- Scan parquet (68) + : : +- Sort (76) + : : +- Exchange (75) + : : +- Project (74) + : : +- Filter (73) + : : +- Scan parquet (72) + : +- Sort (85) + : +- Exchange (84) + : +- Project (83) + : +- Filter (82) + : +- Scan parquet (81) + +- Sort (93) + +- Exchange (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet @@ -101,358 +107,410 @@ Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acct Input [8]: [hash_partition_key#X, 
c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] +Arguments: X + +(6) ColumnarExchange Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [9]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] +Arguments: X + +(23) ColumnarExchange Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, 
c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Arguments: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [10]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(40) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: 
hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [10]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS _pre_X#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(49) FlushableHashAggregateExecTransformer +(55) FlushableHashAggregateExecTransformer Input [10]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X, _pre_X#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(50) ProjectExecTransformer +(56) ProjectExecTransformer Output [10]: [hash(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Input [9]: 
[c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(51) WholeStageCodegenTransformer (X) +(57) WholeStageCodegenTransformer (X) Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: false -(52) ColumnarExchange +(58) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] +Arguments: X + +(59) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(60) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: X -(54) InputAdapter +(61) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(55) InputIteratorTransformer +(62) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(56) RegularHashAggregateExecTransformer +(63) RegularHashAggregateExecTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(57) ProjectExecTransformer +(64) ProjectExecTransformer Output [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Input [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(58) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: false -(59) 
TakeOrderedAndProjectExecTransformer +(66) TakeOrderedAndProjectExecTransformer Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X], 0 -(60) VeloxColumnarToRowExec +(67) VeloxColumnarToRowExec Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(61) Scan parquet +(68) Scan parquet Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(62) Filter +(69) Filter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(63) Exchange +(70) Exchange Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) Scan parquet +(71) Sort +Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(72) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(65) Filter +(73) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(66) Project +(74) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(67) Exchange +(75) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) ShuffledHashJoin +(76) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(77) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(69) Project +(78) Project Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(70) Exchange +(79) Exchange Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(80) Sort +Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(81) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(72) Filter +(82) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Condition : ((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(73) Project +(83) Project 
Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(74) Exchange +(84) Exchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(75) ShuffledHashJoin +(85) Sort +Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(86) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(76) Project +(87) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(77) Exchange +(88) Exchange Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(78) Scan parquet +(89) Sort +Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(79) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(80) Exchange +(92) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(81) ShuffledHashJoin +(93) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(94) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(82) Project +(95) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(83) HashAggregate +(96) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(84) Exchange +(97) Exchange Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) HashAggregate +(98) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: 
[sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(86) TakeOrderedAndProject +(99) TakeOrderedAndProject Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(87) AdaptiveSparkPlan +(100) AdaptiveSparkPlan Output [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/11.txt index f028b8daa9b34..cccf1408bea99 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/11.txt @@ -1,71 +1,75 @@ == Physical Plan == -AdaptiveSparkPlan (72) +AdaptiveSparkPlan (82) +- == Final Plan == - VeloxColumnarToRowExec (50) - +- ^ SortExecTransformer (48) - +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ ShuffleQueryStage (45) - +- ColumnarExchange (44) - +- ^ FilterExecTransformer (42) - +- ^ RegularHashAggregateExecTransformer (41) - +- ^ InputIteratorTransformer (40) - +- ^ InputAdapter (39) - +- ^ ShuffleQueryStage (38) - +- ColumnarExchange (37) - +- ^ ProjectExecTransformer (35) - +- ^ FlushableHashAggregateExecTransformer (34) - +- ^ ProjectExecTransformer (33) - +- ^ ShuffledHashJoinExecTransformer Inner (32) - :- ^ InputIteratorTransformer (23) - : +- ^ InputAdapter (22) - : +- ^ ShuffleQueryStage (21) - : +- ColumnarExchange (20) - : +- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (56) + +- ^ SortExecTransformer (54) + +- ^ InputIteratorTransformer (53) + +- ShuffleQueryStage (51) + +- ColumnarExchange (50) + +- VeloxAppendBatches (49) + +- ^ FilterExecTransformer (47) + +- ^ RegularHashAggregateExecTransformer (46) + +- ^ InputIteratorTransformer (45) + +- ShuffleQueryStage (43) + +- ColumnarExchange (42) + +- VeloxAppendBatches (41) + +- ^ ProjectExecTransformer (39) + +- ^ FlushableHashAggregateExecTransformer (38) + +- ^ ProjectExecTransformer (37) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + :- ^ InputIteratorTransformer (26) + : +- ShuffleQueryStage (24) + : +- ColumnarExchange (23) + : +- VeloxAppendBatches (22) + : +- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : :- ^ InputIteratorTransformer (9) 
+ : : +- ShuffleQueryStage (7) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ NoopFilter (10) - : +- ^ Scan parquet (9) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29) - +- ColumnarExchange (28) - +- ^ ProjectExecTransformer (26) - +- ^ NoopFilter (25) - +- ^ Scan parquet (24) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ NoopFilter (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ ProjectExecTransformer (29) + +- ^ NoopFilter (28) + +- ^ Scan parquet (27) +- == Initial Plan == - Sort (71) - +- Exchange (70) - +- Filter (69) - +- HashAggregate (68) - +- Exchange (67) - +- HashAggregate (66) - +- Project (65) - +- ShuffledHashJoin Inner BuildRight (64) - :- Exchange (59) - : +- Project (58) - : +- ShuffledHashJoin Inner BuildRight (57) - : :- Exchange (53) - : : +- Filter (52) - : : +- Scan parquet (51) - : +- Exchange (56) - : +- Filter (55) - : +- Scan parquet (54) - +- Exchange (63) - +- Project (62) - +- Filter (61) - +- Scan parquet (60) + Sort (81) + +- Exchange (80) + +- Filter (79) + +- HashAggregate (78) + +- Exchange (77) + +- HashAggregate (76) + +- Project (75) + +- SortMergeJoin Inner (74) + :- Sort (68) + : +- Exchange (67) + : +- Project (66) + : +- SortMergeJoin Inner (65) + : :- Sort (60) + : : +- Exchange (59) + : : +- Filter (58) + : : +- Scan parquet (57) + : +- Sort (64) + : +- Exchange (63) + : +- Filter (62) + : +- Scan parquet (61) + +- Sort (73) + +- Exchange (72) + +- Project (71) + +- Filter (70) + +- Scan parquet (69) (1) Scan parquet @@ -87,292 +91,332 @@ Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(9) Scan parquet +(10) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: [s_suppkey#X, s_nationkey#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(12) 
WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: X + +(23) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [1]: [n_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [1]: [n_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(33) 
ProjectExecTransformer +(37) ProjectExecTransformer Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true) AS _pre_X#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(34) FlushableHashAggregateExecTransformer +(38) FlushableHashAggregateExecTransformer Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(35) ProjectExecTransformer +(39) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(36) WholeStageCodegenTransformer (X) +(40) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: false -(37) ColumnarExchange +(41) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] +Arguments: X + +(42) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(38) ShuffleQueryStage +(43) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: X -(39) InputAdapter +(44) InputAdapter Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(40) InputIteratorTransformer +(45) InputIteratorTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(41) RegularHashAggregateExecTransformer +(46) RegularHashAggregateExecTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X AS value#X] -(42) FilterExecTransformer +(47) FilterExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(43) WholeStageCodegenTransformer (X) +(48) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(44) ColumnarExchange +(49) VeloxAppendBatches +Input [2]: [ps_partkey#X, value#X] +Arguments: X + +(50) ColumnarExchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(45) ShuffleQueryStage +(51) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] Arguments: X -(46) InputAdapter +(52) InputAdapter Input [2]: [ps_partkey#X, value#X] -(47) InputIteratorTransformer +(53) InputIteratorTransformer Input [2]: [ps_partkey#X, value#X] -(48) SortExecTransformer +(54) SortExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(50) VeloxColumnarToRowExec +(56) 
VeloxColumnarToRowExec Input [2]: [ps_partkey#X, value#X] -(51) Scan parquet +(57) Scan parquet Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(52) Filter +(58) Filter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(53) Exchange +(59) Exchange Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) Scan parquet +(60) Sort +Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(61) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(55) Filter +(62) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(56) Exchange +(63) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(57) ShuffledHashJoin +(64) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(65) SortMergeJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(58) Project +(66) Project Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(59) Exchange +(67) Exchange Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(60) Scan parquet +(68) Sort +Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(69) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(61) Filter +(70) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(62) Project +(71) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(63) Exchange +(72) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) ShuffledHashJoin +(73) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(74) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(65) Project +(75) Project Output [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(66) HashAggregate +(76) HashAggregate Input [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(67) Exchange +(77) Exchange Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) HashAggregate +(78) 
HashAggregate Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X AS value#X] -(69) Filter +(79) Filter Input [2]: [ps_partkey#X, value#X] Condition : (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(70) Exchange +(80) Exchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Sort +(81) Sort Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(72) AdaptiveSparkPlan +(82) AdaptiveSparkPlan Output [2]: [ps_partkey#X, value#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/12.txt index 914b1813df44c..17cdf62608cc5 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/12.txt @@ -1,50 +1,52 @@ == Physical Plan == -AdaptiveSparkPlan (49) +AdaptiveSparkPlan (55) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- ^ SortExecTransformer (32) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29) - +- ColumnarExchange (28) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (48) - +- Exchange (47) - +- HashAggregate (46) 
- +- Exchange (45) - +- HashAggregate (44) - +- Project (43) - +- ShuffledHashJoin Inner BuildLeft (42) - :- Exchange (37) - : +- Filter (36) - : +- Scan parquet (35) - +- Exchange (41) - +- Project (40) - +- Filter (39) - +- Scan parquet (38) + Sort (54) + +- Exchange (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- Project (49) + +- SortMergeJoin Inner (48) + :- Sort (42) + : +- Exchange (41) + : +- Filter (40) + : +- Scan parquet (39) + +- Sort (47) + +- Exchange (46) + +- Project (45) + +- Filter (44) + +- Scan parquet (43) (1) Scan parquet @@ -66,196 +68,220 @@ Input [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [o_orderkey#X, o_orderpriority#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderpriority#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Arguments: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_shipmode#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [l_orderkey#X, l_shipmode#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [l_orderkey#X, l_shipmode#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [o_orderpriority#X, l_shipmode#X, CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END AS _pre_X#X, CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END AS _pre_X#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(19) 
FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [o_orderpriority#X, l_shipmode#X, _pre_X#X, _pre_X#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(_pre_X#X), partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [4]: [hash(l_shipmode#X, 42) AS hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Input [3]: [l_shipmode#X, sum#X, sum#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] +Arguments: X + +(25) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [3]: [l_shipmode#X, sum#X, sum#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(32) SortExecTransformer +(36) SortExecTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(33) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(34) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(35) Scan parquet +(39) Scan parquet Output [2]: [o_orderkey#X, o_orderpriority#X] Batched: true Location: 
InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(36) Filter +(40) Filter Input [2]: [o_orderkey#X, o_orderpriority#X] Condition : isnotnull(o_orderkey#X) -(37) Exchange +(41) Exchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(38) Scan parquet +(42) Sort +Input [2]: [o_orderkey#X, o_orderpriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(43) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(39) Filter +(44) Filter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Condition : ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(40) Project +(45) Project Output [2]: [l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(41) Exchange +(46) Exchange Input [2]: [l_orderkey#X, l_shipmode#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(42) ShuffledHashJoin +(47) Sort +Input [2]: [l_orderkey#X, l_shipmode#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(48) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(43) Project +(49) Project Output [2]: [o_orderpriority#X, l_shipmode#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(44) HashAggregate +(50) HashAggregate Input [2]: [o_orderpriority#X, l_shipmode#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), partial_sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(45) Exchange +(51) Exchange Input [3]: [l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(46) HashAggregate +(52) HashAggregate Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(47) Exchange +(53) Exchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC 
NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(48) Sort +(54) Sort Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(49) AdaptiveSparkPlan +(55) AdaptiveSparkPlan Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/13.txt index 408b8a0f7f979..730f0e0a438a5 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/13.txt @@ -1,53 +1,55 @@ == Physical Plan == -AdaptiveSparkPlan (52) +AdaptiveSparkPlan (58) +- == Final Plan == - VeloxColumnarToRowExec (36) - +- ^ SortExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31) - +- ColumnarExchange (30) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ ProjectExecTransformer (17) - +- ^ ShuffledHashJoinExecTransformer LeftOuter (16) - :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ ShuffleQueryStage (5) - : +- ColumnarExchange (4) + VeloxColumnarToRowExec (40) + +- ^ SortExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ FlushableHashAggregateExecTransformer (23) + +- ^ ProjectExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ ProjectExecTransformer (19) + +- ^ ShuffledHashJoinExecTransformer LeftOuter BuildLeft (18) + :- ^ InputIteratorTransformer (8) + : +- ShuffleQueryStage (6) + : +- ColumnarExchange (5) + : +- VeloxAppendBatches (4) : +- ^ ProjectExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (15) - +- ^ InputAdapter (14) - +- ^ ShuffleQueryStage (13) - +- ColumnarExchange (12) - +- ^ ProjectExecTransformer (10) - +- ^ NoopFilter (9) - +- ^ Scan parquet (8) + +- ^ InputIteratorTransformer (17) + +- ShuffleQueryStage (15) + +- ColumnarExchange (14) + +- VeloxAppendBatches (13) + +- ^ ProjectExecTransformer (11) + +- ^ NoopFilter (10) + +- ^ Scan parquet (9) +- == Initial Plan == - Sort (51) - +- Exchange (50) - +- HashAggregate (49) - +- Exchange (48) - +- HashAggregate (47) - +- HashAggregate (46) - +- HashAggregate (45) - +- Project (44) - +- ShuffledHashJoin LeftOuter BuildRight (43) - :- Exchange (38) - : +- Scan parquet (37) - +- Exchange (42) - +- Project (41) - +- Filter (40) - +- Scan parquet (39) + Sort (57) + +- Exchange (56) + +- HashAggregate (55) + +- Exchange (54) + +- HashAggregate (53) + +- HashAggregate (52) + +- HashAggregate (51) + +- Project (50) + +- SortMergeJoin LeftOuter (49) + :- Sort (43) + : +- Exchange (42) + : +- Scan parquet (41) + +- Sort (48) + +- Exchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet 
(44) (1) Scan parquet @@ -64,223 +66,247 @@ Input [1]: [c_custkey#X] Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: false -(4) ColumnarExchange +(4) VeloxAppendBatches +Input [2]: [hash_partition_key#X, c_custkey#X] +Arguments: X + +(5) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] -(5) ShuffleQueryStage +(6) ShuffleQueryStage Output [1]: [c_custkey#X] Arguments: X -(6) InputAdapter +(7) InputAdapter Input [1]: [c_custkey#X] -(7) InputIteratorTransformer +(8) InputIteratorTransformer Input [1]: [c_custkey#X] -(8) Scan parquet +(9) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(9) NoopFilter +(10) NoopFilter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Arguments: [o_orderkey#X, o_custkey#X, o_comment#X] -(10) ProjectExecTransformer +(11) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(12) ColumnarExchange +(13) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(14) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(13) ShuffleQueryStage +(15) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(14) InputAdapter +(16) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(15) InputIteratorTransformer +(17) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(16) ShuffledHashJoinExecTransformer +(18) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(17) ProjectExecTransformer +(19) ProjectExecTransformer Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(18) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [2]: [c_custkey#X, count(o_orderkey#X)#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [1]: [count(o_orderkey#X)#X AS c_count#X] Input [2]: [c_custkey#X, count(o_orderkey#X)#X] -(21) FlushableHashAggregateExecTransformer +(23) FlushableHashAggregateExecTransformer Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [3]: [hash(c_count#X, 42) AS hash_partition_key#X, c_count#X, count#X] Input [2]: [c_count#X, count#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: false -(24) ColumnarExchange +(26) 
VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_count#X, count#X] +Arguments: X + +(27) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [2]: [c_count#X, count#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [2]: [c_count#X, count#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input [2]: [c_count#X, count#X] -(28) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(30) ColumnarExchange +(33) VeloxAppendBatches +Input [2]: [c_count#X, custdist#X] +Arguments: X + +(34) ColumnarExchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(35) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] Arguments: X -(32) InputAdapter +(36) InputAdapter Input [2]: [c_count#X, custdist#X] -(33) InputIteratorTransformer +(37) InputIteratorTransformer Input [2]: [c_count#X, custdist#X] -(34) SortExecTransformer +(38) SortExecTransformer Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(36) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [2]: [c_count#X, custdist#X] -(37) Scan parquet +(41) Scan parquet Output [1]: [c_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Exchange +(42) Exchange Input [1]: [c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) Scan parquet +(43) Sort +Input [1]: [c_custkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(44) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(40) Filter +(45) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Condition : ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(41) Project +(46) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(42) Exchange +(47) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) ShuffledHashJoin +(48) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(49) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(44) Project +(50) Project Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(45) HashAggregate +(51) HashAggregate Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(46) HashAggregate +(52) HashAggregate Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] 
Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [1]: [count(o_orderkey#X)#X AS c_count#X] -(47) HashAggregate +(53) HashAggregate Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(48) Exchange +(54) Exchange Input [2]: [c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) HashAggregate +(55) HashAggregate Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(50) Exchange +(56) Exchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Sort +(57) Sort Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(52) AdaptiveSparkPlan +(58) AdaptiveSparkPlan Output [2]: [c_count#X, custdist#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/14.txt index c1e2a1e521304..55111a31f8740 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/14.txt @@ -1,38 +1,40 @@ == Physical Plan == -AdaptiveSparkPlan (35) +AdaptiveSparkPlan (39) +- == Final Plan == - VeloxColumnarToRowExec (23) - +- ^ ProjectExecTransformer (21) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (25) + +- ^ ProjectExecTransformer (23) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - HashAggregate (34) - +- HashAggregate (33) - +- Project (32) - +- ShuffledHashJoin Inner BuildRight (31) - :- Exchange (27) - : +- Project (26) - : +- Filter (25) - : +- Scan parquet (24) - +- Exchange (30) - +- Filter (29) - +- Scan parquet (28) + HashAggregate (38) + +- HashAggregate (37) + +- Project (36) + +- SortMergeJoin Inner (35) + :- Sort (30) + : +- Exchange (29) + : +- Project (28) + : +- Filter (27) + : +- Scan parquet (26) + +- Sort (34) + +- Exchange (33) + +- Filter (32) + +- Scan parquet (31) (1) Scan parquet @@ -54,144 +56,160 @@ Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, 
l_discount#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] -(9) Scan parquet +(10) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [2]: [p_partkey#X, p_type#X] Arguments: [p_partkey#X, p_type#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_type#X] Input [2]: [p_partkey#X, p_type#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [p_partkey#X, p_type#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [p_partkey#X, p_type#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [p_partkey#X, p_type#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [l_extendedprice#X, l_discount#X, p_type#X, CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END AS _pre_X#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS _pre_X#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [5]: [l_extendedprice#X, l_discount#X, p_type#X, _pre_X#X, _pre_X#X] Keys: [] Functions [2]: [partial_sum(_pre_X#X), partial_sum(_pre_X#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(20) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), 
DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(21) ProjectExecTransformer +(23) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X)), DecimalType(38,6), true)) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X as decimal(38,6)))), DecimalType(38,6), true) AS promo_revenue#X] Input [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(22) WholeStageCodegenTransformer (X) +(24) WholeStageCodegenTransformer (X) Input [1]: [promo_revenue#X] Arguments: false -(23) VeloxColumnarToRowExec +(25) VeloxColumnarToRowExec Input [1]: [promo_revenue#X] -(24) Scan parquet +(26) Scan parquet Output [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-09-01), LessThan(l_shipdate,1995-10-01), IsNotNull(l_partkey)] ReadSchema: struct -(25) Filter +(27) Filter Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-09-01)) AND (l_shipdate#X < 1995-10-01)) AND isnotnull(l_partkey#X)) -(26) Project +(28) Project Output [3]: [l_partkey#X, 
l_extendedprice#X, l_discount#X] Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(27) Exchange +(29) Exchange Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(28) Scan parquet +(30) Sort +Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(31) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(29) Filter +(32) Filter Input [2]: [p_partkey#X, p_type#X] Condition : isnotnull(p_partkey#X) -(30) Exchange +(33) Exchange Input [2]: [p_partkey#X, p_type#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(31) ShuffledHashJoin +(34) Sort +Input [2]: [p_partkey#X, p_type#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(35) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(32) Project +(36) Project Output [3]: [l_extendedprice#X, l_discount#X, p_type#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(33) HashAggregate +(37) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, p_type#X] Keys: [] Functions [2]: [partial_sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(34) HashAggregate +(38) HashAggregate Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X)), 
DecimalType(38,6), true)) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X as decimal(38,6)))), DecimalType(38,6), true) AS promo_revenue#X] -(35) AdaptiveSparkPlan +(39) AdaptiveSparkPlan Output [1]: [promo_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/15.txt index 1da11e4fffea2..db2df6c875443 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/15.txt @@ -1,47 +1,49 @@ == Physical Plan == -AdaptiveSparkPlan (45) +AdaptiveSparkPlan (50) +- == Final Plan == - VeloxColumnarToRowExec (30) - +- ^ SortExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ ShuffledHashJoinExecTransformer Inner (21) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (33) + +- ^ SortExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (23) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ FilterExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16) - +- ColumnarExchange (15) - +- ^ ProjectExecTransformer (13) - +- ^ FlushableHashAggregateExecTransformer (12) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ FilterExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ InputIteratorTransformer (20) + +- ShuffleQueryStage (18) + +- ColumnarExchange (17) + +- VeloxAppendBatches (16) + +- ^ ProjectExecTransformer (14) + +- ^ FlushableHashAggregateExecTransformer (13) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (44) - +- Exchange (43) - +- Project (42) - +- ShuffledHashJoin Inner BuildLeft (41) - :- Exchange (33) - : +- Filter (32) - : +- Scan parquet (31) - +- Filter (40) - +- HashAggregate (39) - +- Exchange (38) - +- HashAggregate (37) - +- Project (36) - +- Filter (35) - +- Scan parquet (34) + Sort (49) + +- Exchange (48) + +- Project (47) + +- SortMergeJoin Inner (46) + :- Sort (37) + : +- Exchange (36) + : +- Filter (35) + : +- Scan parquet (34) + +- Sort (45) + +- Filter (44) + +- HashAggregate (43) + +- Exchange (42) + +- HashAggregate (41) + +- Project (40) + +- Filter (39) + +- Scan parquet (38) (1) Scan parquet @@ -63,182 +65,202 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] +Arguments: X + +(6) 
ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(12) FlushableHashAggregateExecTransformer +(13) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(13) ProjectExecTransformer +(14) ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(14) WholeStageCodegenTransformer (X) +(15) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(16) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(17) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(18) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(19) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(18) InputIteratorTransformer +(20) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as 
decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS total_revenue#X] -(20) FilterExecTransformer +(22) FilterExecTransformer Input [2]: [supplier_no#X, total_revenue#X] Arguments: (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(21) ShuffledHashJoinExecTransformer +(23) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] +Arguments: X + +(27) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(28) SortExecTransformer +(31) SortExecTransformer Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(30) VeloxColumnarToRowExec +(33) VeloxColumnarToRowExec Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(31) Scan parquet +(34) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey)] ReadSchema: struct -(32) Filter +(35) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Condition : isnotnull(s_suppkey#X) -(33) Exchange +(36) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(34) Scan parquet +(37) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(38) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(35) Filter +(39) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(36) Project +(40) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(37) HashAggregate +(41) HashAggregate Input [3]: [l_suppkey#X, 
l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(38) Exchange +(42) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) HashAggregate +(43) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS total_revenue#X] -(40) Filter +(44) Filter Input [2]: [supplier_no#X, total_revenue#X] Condition : (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(41) ShuffledHashJoin +(45) Sort +Input [2]: [supplier_no#X, total_revenue#X] +Arguments: [supplier_no#X ASC NULLS FIRST], false, 0 + +(46) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(42) Project +(47) Project Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(43) Exchange +(48) Exchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(44) Sort +(49) Sort Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(45) AdaptiveSparkPlan +(50) AdaptiveSparkPlan Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/16.txt index 4a8b56e41f363..2eb5668906ba9 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/16.txt @@ -1,64 +1,66 @@ == Physical Plan == -AdaptiveSparkPlan (64) +AdaptiveSparkPlan (71) +- == Final Plan == - VeloxColumnarToRowExec (42) - +- ^ SortExecTransformer (40) - +- ^ InputIteratorTransformer (39) - +- ^ InputAdapter (38) - +- ^ ShuffleQueryStage (37) - +- ColumnarExchange (36) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31) - +- ColumnarExchange (30) - +- ^ ProjectExecTransformer (28) - +- ^ FlushableHashAggregateExecTransformer (27) - +- ^ RegularHashAggregateExecTransformer (26) 
- +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (47) + +- ^ SortExecTransformer (45) + +- ^ InputIteratorTransformer (44) + +- ShuffleQueryStage (42) + +- ColumnarExchange (41) + +- VeloxAppendBatches (40) + +- ^ RegularHashAggregateExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ ProjectExecTransformer (31) + +- ^ FlushableHashAggregateExecTransformer (30) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (63) - +- Exchange (62) - +- HashAggregate (61) - +- Exchange (60) - +- HashAggregate (59) - +- HashAggregate (58) - +- Exchange (57) - +- HashAggregate (56) - +- Project (55) - +- ShuffledHashJoin Inner BuildRight (54) - :- Exchange (50) - : +- BroadcastHashJoin LeftAnti BuildRight (49) - : :- Filter (44) - : : +- Scan parquet (43) - : +- BroadcastExchange (48) - : +- Project (47) - : +- Filter (46) - : +- Scan parquet (45) - +- Exchange (53) - +- Filter (52) - +- Scan parquet (51) + Sort (70) + +- Exchange (69) + +- HashAggregate (68) + +- Exchange (67) + +- HashAggregate (66) + +- HashAggregate (65) + +- Exchange (64) + +- HashAggregate (63) + +- Project (62) + +- SortMergeJoin Inner (61) + :- Sort (56) + : +- Exchange (55) + : +- BroadcastHashJoin LeftAnti BuildRight (54) + : :- Filter (49) + : : +- Scan parquet (48) + : +- BroadcastExchange (53) + : +- Project (52) + : +- Filter (51) + : +- Scan parquet (50) + +- Sort (60) + +- Exchange (59) + +- Filter (58) + +- Scan parquet (57) (1) Scan parquet @@ -80,270 +82,298 @@ Input [2]: [ps_partkey#X, ps_suppkey#X] Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [ps_partkey#X, ps_suppkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [ps_partkey#X, ps_suppkey#X] -(8) InputIteratorTransformer +(9) 
InputIteratorTransformer Input [2]: [ps_partkey#X, ps_suppkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(26) RegularHashAggregateExecTransformer +(29) 
RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(27) FlushableHashAggregateExecTransformer +(30) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(28) ProjectExecTransformer +(31) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: false -(30) ColumnarExchange +(33) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] +Arguments: X + +(34) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(35) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: X -(32) InputAdapter +(36) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(33) InputIteratorTransformer +(37) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(34) RegularHashAggregateExecTransformer +(38) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] +Arguments: X + +(41) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(40) SortExecTransformer +(45) SortExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(41) WholeStageCodegenTransformer (X) +(46) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(42) VeloxColumnarToRowExec +(47) VeloxColumnarToRowExec Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(43) Scan parquet +(48) Scan parquet Output [2]: [ps_partkey#X, ps_suppkey#X] Batched: true Location: 
InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_partkey)] ReadSchema: struct -(44) Filter +(49) Filter Input [2]: [ps_partkey#X, ps_suppkey#X] Condition : isnotnull(ps_partkey#X) -(45) Scan parquet +(50) Scan parquet Output [2]: [s_suppkey#X, s_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_comment)] ReadSchema: struct -(46) Filter +(51) Filter Input [2]: [s_suppkey#X, s_comment#X] Condition : (isnotnull(s_comment#X) AND s_comment#X LIKE %Customer%Complaints%) -(47) Project +(52) Project Output [1]: [s_suppkey#X] Input [2]: [s_suppkey#X, s_comment#X] -(48) BroadcastExchange +(53) BroadcastExchange Input [1]: [s_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),true), [plan_id=X] -(49) BroadcastHashJoin +(54) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(50) Exchange +(55) Exchange Input [2]: [ps_partkey#X, ps_suppkey#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Scan parquet +(56) Sort +Input [2]: [ps_partkey#X, ps_suppkey#X] +Arguments: [ps_partkey#X ASC NULLS FIRST], false, 0 + +(57) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(52) Filter +(58) Filter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Condition : (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(53) Exchange +(59) Exchange Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) ShuffledHashJoin +(60) Sort +Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(61) SortMergeJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(55) Project +(62) Project Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(56) HashAggregate +(63) HashAggregate Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(57) Exchange +(64) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(58) HashAggregate +(65) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(59) HashAggregate +(66) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(60) Exchange +(67) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(61) 
HashAggregate +(68) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(62) Exchange +(69) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(63) Sort +(70) Sort Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(64) AdaptiveSparkPlan +(71) AdaptiveSparkPlan Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/17.txt index 523c081b1d19a..5226aacff7531 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/17.txt @@ -1,59 +1,62 @@ == Physical Plan == -AdaptiveSparkPlan (57) +AdaptiveSparkPlan (63) +- == Final Plan == - VeloxColumnarToRowExec (37) - +- ^ ProjectExecTransformer (35) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ RegularHashAggregateExecTransformer (33) - +- ^ ProjectExecTransformer (32) - +- ^ ShuffledHashJoinExecTransformer Inner (31) - :- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (40) + +- ^ ProjectExecTransformer (38) + +- ^ RegularHashAggregateExecTransformer (37) + +- ^ RegularHashAggregateExecTransformer (36) + +- ^ ProjectExecTransformer (35) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (34) + :- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ NoopFilter (10) - : +- ^ Scan parquet (9) - +- ^ FilterExecTransformer (30) - +- ^ ProjectExecTransformer (29) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ NoopFilter (20) - +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ NoopFilter (11) + : +- ^ Scan parquet (10) + +- ^ FilterExecTransformer (33) + +- ^ ProjectExecTransformer (32) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ 
FlushableHashAggregateExecTransformer (23) + +- ^ NoopFilter (22) + +- ^ Scan parquet (21) +- == Initial Plan == - HashAggregate (56) - +- HashAggregate (55) - +- Project (54) - +- ShuffledHashJoin Inner BuildRight (53) - :- Project (46) - : +- ShuffledHashJoin Inner BuildRight (45) - : :- Exchange (40) - : : +- Filter (39) - : : +- Scan parquet (38) - : +- Exchange (44) - : +- Project (43) - : +- Filter (42) - : +- Scan parquet (41) - +- Filter (52) - +- HashAggregate (51) - +- Exchange (50) - +- HashAggregate (49) - +- Filter (48) - +- Scan parquet (47) + HashAggregate (62) + +- HashAggregate (61) + +- Project (60) + +- SortMergeJoin Inner (59) + :- Project (51) + : +- SortMergeJoin Inner (50) + : :- Sort (44) + : : +- Exchange (43) + : : +- Filter (42) + : : +- Scan parquet (41) + : +- Sort (49) + : +- Exchange (48) + : +- Project (47) + : +- Filter (46) + : +- Scan parquet (45) + +- Sort (58) + +- Filter (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- Filter (53) + +- Scan parquet (52) (1) Scan parquet @@ -75,250 +78,274 @@ Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Arguments: [p_partkey#X, p_brand#X, p_container#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [2]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(15) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [1]: [p_partkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [1]: [p_partkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] 
-(19) Scan parquet +(21) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(20) NoopFilter +(22) NoopFilter Input [2]: [l_partkey#X, l_quantity#X] Arguments: [l_partkey#X, l_quantity#X] -(21) FlushableHashAggregateExecTransformer +(23) FlushableHashAggregateExecTransformer Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [4]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_partkey#X, sum#X, count#X] Input [3]: [l_partkey#X, sum#X, count#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] +Arguments: X + +(27) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [3]: [l_partkey#X, sum#X, count#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [3]: [l_partkey#X, sum#X, count#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input [3]: [l_partkey#X, sum#X, count#X] -(28) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [l_partkey#X, avg(l_quantity#X)#X] -(29) ProjectExecTransformer +(32) ProjectExecTransformer Output [2]: [CheckOverflow((0.200000 * promote_precision(avg(l_quantity#X)#X)), DecimalType(18,7), true) AS (0.2 * avg(l_quantity))#X, l_partkey#X] Input [2]: [l_partkey#X, avg(l_quantity#X)#X] -(30) FilterExecTransformer +(33) FilterExecTransformer Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Arguments: isnotnull((0.2 * avg(l_quantity))#X) -(31) ShuffledHashJoinExecTransformer +(34) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(32) ProjectExecTransformer +(35) ProjectExecTransformer Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(33) RegularHashAggregateExecTransformer +(36) RegularHashAggregateExecTransformer Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(34) RegularHashAggregateExecTransformer +(37) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [sum(l_extendedprice#X)#X] -(35) ProjectExecTransformer +(38) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6), true) AS avg_yearly#X] Input [1]: [sum(l_extendedprice#X)#X] -(36) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [1]: [avg_yearly#X] Arguments: false -(37) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [1]: 
[avg_yearly#X] -(38) Scan parquet +(41) Scan parquet Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_quantity)] ReadSchema: struct -(39) Filter +(42) Filter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Condition : (isnotnull(l_partkey#X) AND isnotnull(l_quantity#X)) -(40) Exchange +(43) Exchange Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Scan parquet +(44) Sort +Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(45) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(42) Filter +(46) Filter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Condition : ((((isnotnull(p_brand#X) AND isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(43) Project +(47) Project Output [1]: [p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(44) Exchange +(48) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(45) ShuffledHashJoin +(49) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(50) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(46) Project +(51) Project Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(47) Scan parquet +(52) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(48) Filter +(53) Filter Input [2]: [l_partkey#X, l_quantity#X] Condition : isnotnull(l_partkey#X) -(49) HashAggregate +(54) HashAggregate Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(50) Exchange +(55) Exchange Input [3]: [l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) HashAggregate +(56) HashAggregate Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [CheckOverflow((0.200000 * promote_precision(avg(l_quantity#X)#X)), DecimalType(18,7), true) AS (0.2 * avg(l_quantity))#X, l_partkey#X] -(52) Filter +(57) Filter Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Condition : isnotnull((0.2 * avg(l_quantity))#X) -(53) ShuffledHashJoin +(58) Sort +Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(59) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(54) Project +(60) Project Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(55) HashAggregate +(61) HashAggregate Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: 
[sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(56) HashAggregate +(62) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6), true) AS avg_yearly#X] -(57) AdaptiveSparkPlan +(63) AdaptiveSparkPlan Output [1]: [avg_yearly#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/18.txt index 6045020cb49af..c1287b2d685a4 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/18.txt @@ -1,96 +1,102 @@ == Physical Plan == -AdaptiveSparkPlan (97) +AdaptiveSparkPlan (110) +- == Final Plan == - VeloxColumnarToRowExec (64) - +- TakeOrderedAndProjectExecTransformer (63) - +- ^ RegularHashAggregateExecTransformer (61) - +- ^ RegularHashAggregateExecTransformer (60) - +- ^ ProjectExecTransformer (59) - +- ^ ShuffledHashJoinExecTransformer Inner (58) - :- ^ InputIteratorTransformer (41) - : +- ^ InputAdapter (40) - : +- ^ ShuffleQueryStage (39) - : +- ColumnarExchange (38) - : +- ^ ProjectExecTransformer (36) - : +- ^ ShuffledHashJoinExecTransformer Inner (35) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (70) + +- TakeOrderedAndProjectExecTransformer (69) + +- ^ RegularHashAggregateExecTransformer (67) + +- ^ RegularHashAggregateExecTransformer (66) + +- ^ ProjectExecTransformer (65) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (64) + :- ^ InputIteratorTransformer (46) + : +- ShuffleQueryStage (44) + : +- ColumnarExchange (43) + : +- VeloxAppendBatches (42) + : +- ^ ProjectExecTransformer (40) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (39) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (34) - : +- ^ InputAdapter (33) - : +- ^ ShuffleQueryStage (32) - : +- ColumnarExchange (31) - : +- ^ ProjectExecTransformer (29) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (28) - : :- ^ InputIteratorTransformer (16) - : : +- ^ InputAdapter (15) - : : +- ^ ShuffleQueryStage (14) - : : +- ColumnarExchange (13) - : : +- ^ ProjectExecTransformer (11) - : : +- ^ NoopFilter (10) - : : +- ^ Scan parquet (9) - : +- ^ ProjectExecTransformer (27) - : +- ^ FilterExecTransformer (26) - : +- ^ RegularHashAggregateExecTransformer (25) - : +- ^ InputIteratorTransformer (24) - : +- ^ InputAdapter (23) - : +- ^ ShuffleQueryStage (22) - : +- ColumnarExchange (21) - : +- ^ ProjectExecTransformer (19) - : +- ^ FlushableHashAggregateExecTransformer (18) - : +- ^ Scan parquet (17) - +- ^ ShuffledHashJoinExecTransformer LeftSemi (57) - :- ^ InputIteratorTransformer (49) - : +- ^ InputAdapter (48) - : +- ^ ShuffleQueryStage (47) - : +- ColumnarExchange (46) - : +- ^ ProjectExecTransformer (44) - : +- ^ NoopFilter (43) - : +- ^ Scan parquet (42) - +- ^ ProjectExecTransformer (56) - +- ^ FilterExecTransformer (55) - +- ^ RegularHashAggregateExecTransformer (54) - +- ^ InputIteratorTransformer (53) - +- ^ InputAdapter (52) - +- ^ ShuffleQueryStage 
(51) - +- ReusedExchange (50) + : +- ^ InputIteratorTransformer (38) + : +- ShuffleQueryStage (36) + : +- ColumnarExchange (35) + : +- VeloxAppendBatches (34) + : +- ^ ProjectExecTransformer (32) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (31) + : :- ^ InputIteratorTransformer (18) + : : +- ShuffleQueryStage (16) + : : +- ColumnarExchange (15) + : : +- VeloxAppendBatches (14) + : : +- ^ ProjectExecTransformer (12) + : : +- ^ NoopFilter (11) + : : +- ^ Scan parquet (10) + : +- ^ ProjectExecTransformer (30) + : +- ^ FilterExecTransformer (29) + : +- ^ RegularHashAggregateExecTransformer (28) + : +- ^ InputIteratorTransformer (27) + : +- ShuffleQueryStage (25) + : +- ColumnarExchange (24) + : +- VeloxAppendBatches (23) + : +- ^ ProjectExecTransformer (21) + : +- ^ FlushableHashAggregateExecTransformer (20) + : +- ^ Scan parquet (19) + +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (63) + :- ^ InputIteratorTransformer (55) + : +- ShuffleQueryStage (53) + : +- ColumnarExchange (52) + : +- VeloxAppendBatches (51) + : +- ^ ProjectExecTransformer (49) + : +- ^ NoopFilter (48) + : +- ^ Scan parquet (47) + +- ^ ProjectExecTransformer (62) + +- ^ FilterExecTransformer (61) + +- ^ RegularHashAggregateExecTransformer (60) + +- ^ InputIteratorTransformer (59) + +- ShuffleQueryStage (57) + +- ReusedExchange (56) +- == Initial Plan == - TakeOrderedAndProject (96) - +- HashAggregate (95) - +- HashAggregate (94) - +- Project (93) - +- ShuffledHashJoin Inner BuildRight (92) - :- Exchange (81) - : +- Project (80) - : +- ShuffledHashJoin Inner BuildLeft (79) - : :- Exchange (67) - : : +- Filter (66) - : : +- Scan parquet (65) - : +- Exchange (78) - : +- ShuffledHashJoin LeftSemi BuildRight (77) - : :- Exchange (70) - : : +- Filter (69) - : : +- Scan parquet (68) - : +- Project (76) - : +- Filter (75) - : +- HashAggregate (74) - : +- Exchange (73) - : +- HashAggregate (72) - : +- Scan parquet (71) - +- ShuffledHashJoin LeftSemi BuildRight (91) - :- Exchange (84) - : +- Filter (83) - : +- Scan parquet (82) - +- Project (90) - +- Filter (89) - +- HashAggregate (88) - +- Exchange (87) - +- HashAggregate (86) - +- Scan parquet (85) + TakeOrderedAndProject (109) + +- HashAggregate (108) + +- HashAggregate (107) + +- Project (106) + +- SortMergeJoin Inner (105) + :- Sort (92) + : +- Exchange (91) + : +- Project (90) + : +- SortMergeJoin Inner (89) + : :- Sort (74) + : : +- Exchange (73) + : : +- Filter (72) + : : +- Scan parquet (71) + : +- Sort (88) + : +- Exchange (87) + : +- SortMergeJoin LeftSemi (86) + : :- Sort (78) + : : +- Exchange (77) + : : +- Filter (76) + : : +- Scan parquet (75) + : +- Sort (85) + : +- Project (84) + : +- Filter (83) + : +- HashAggregate (82) + : +- Exchange (81) + : +- HashAggregate (80) + : +- Scan parquet (79) + +- SortMergeJoin LeftSemi (104) + :- Sort (96) + : +- Exchange (95) + : +- Filter (94) + : +- Scan parquet (93) + +- Sort (103) + +- Project (102) + +- Filter (101) + +- HashAggregate (100) + +- Exchange (99) + +- HashAggregate (98) + +- Scan parquet (97) (1) Scan parquet @@ -112,420 +118,472 @@ Input [2]: [c_custkey#X, c_name#X] Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) 
ShuffleQueryStage Output [2]: [c_custkey#X, c_name#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [c_custkey#X, c_name#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [c_custkey#X, c_name#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(18) FlushableHashAggregateExecTransformer +(20) FlushableHashAggregateExecTransformer Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] +Arguments: X + +(24) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(25) RegularHashAggregateExecTransformer +(28) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, 
sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(26) FilterExecTransformer +(29) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(27) ProjectExecTransformer +(30) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(28) ShuffledHashJoinExecTransformer +(31) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(29) ProjectExecTransformer +(32) ProjectExecTransformer Output [5]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(30) WholeStageCodegenTransformer (X) +(33) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(31) ColumnarExchange +(34) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(35) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(32) ShuffleQueryStage +(36) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(33) InputAdapter +(37) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(34) InputIteratorTransformer +(38) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(35) ShuffledHashJoinExecTransformer +(39) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(36) ProjectExecTransformer +(40) ProjectExecTransformer Output [6]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(37) WholeStageCodegenTransformer (X) +(41) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(38) ColumnarExchange +(42) VeloxAppendBatches +Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(43) ColumnarExchange Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(39) ShuffleQueryStage +(44) ShuffleQueryStage Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(40) InputAdapter +(45) InputAdapter Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -(41) InputIteratorTransformer +(46) InputIteratorTransformer Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -(42) Scan parquet +(47) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(43) NoopFilter +(48) NoopFilter Input [2]: [l_orderkey#X, l_quantity#X] Arguments: [l_orderkey#X, l_quantity#X] -(44) ProjectExecTransformer 
+(49) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_quantity#X] Input [2]: [l_orderkey#X, l_quantity#X] -(45) WholeStageCodegenTransformer (X) +(50) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] Arguments: false -(46) ColumnarExchange +(51) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] +Arguments: X + +(52) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [id=#X] -(47) ShuffleQueryStage +(53) ShuffleQueryStage Output [2]: [l_orderkey#X, l_quantity#X] Arguments: X -(48) InputAdapter +(54) InputAdapter Input [2]: [l_orderkey#X, l_quantity#X] -(49) InputIteratorTransformer +(55) InputIteratorTransformer Input [2]: [l_orderkey#X, l_quantity#X] -(50) ReusedExchange [Reuses operator id: 21] +(56) ReusedExchange [Reuses operator id: 24] Output [3]: [l_orderkey#X, sum#X, isEmpty#X] -(51) ShuffleQueryStage +(57) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(52) InputAdapter +(58) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(53) InputIteratorTransformer +(59) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(54) RegularHashAggregateExecTransformer +(60) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(55) FilterExecTransformer +(61) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(56) ProjectExecTransformer +(62) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(57) ShuffledHashJoinExecTransformer +(63) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(58) ShuffledHashJoinExecTransformer +(64) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(59) ProjectExecTransformer +(65) ProjectExecTransformer Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(60) RegularHashAggregateExecTransformer +(66) RegularHashAggregateExecTransformer Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(61) RegularHashAggregateExecTransformer +(67) RegularHashAggregateExecTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(62) WholeStageCodegenTransformer (X) +(68) WholeStageCodegenTransformer (X) Input 
[6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: false -(63) TakeOrderedAndProjectExecTransformer +(69) TakeOrderedAndProjectExecTransformer Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X], 0 -(64) VeloxColumnarToRowExec +(70) VeloxColumnarToRowExec Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(65) Scan parquet +(71) Scan parquet Output [2]: [c_custkey#X, c_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey)] ReadSchema: struct -(66) Filter +(72) Filter Input [2]: [c_custkey#X, c_name#X] Condition : isnotnull(c_custkey#X) -(67) Exchange +(73) Exchange Input [2]: [c_custkey#X, c_name#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) Scan parquet +(74) Sort +Input [2]: [c_custkey#X, c_name#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(75) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(69) Filter +(76) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Condition : (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(70) Exchange +(77) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(78) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(79) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(72) HashAggregate +(80) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(73) Exchange +(81) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) HashAggregate +(82) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(75) Filter +(83) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(76) Project +(84) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(77) ShuffledHashJoin +(85) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(86) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(78) Exchange +(87) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(79) ShuffledHashJoin +(88) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(89) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join 
condition: None -(80) Project +(90) Project Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(81) Exchange +(91) Exchange Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(82) Scan parquet +(92) Sort +Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(93) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(83) Filter +(94) Filter Input [2]: [l_orderkey#X, l_quantity#X] Condition : isnotnull(l_orderkey#X) -(84) Exchange +(95) Exchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) Scan parquet +(96) Sort +Input [2]: [l_orderkey#X, l_quantity#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(97) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(86) HashAggregate +(98) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(87) Exchange +(99) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(88) HashAggregate +(100) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(89) Filter +(101) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(90) Project +(102) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(91) ShuffledHashJoin +(103) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(104) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(92) ShuffledHashJoin +(105) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(93) Project +(106) Project Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(94) HashAggregate +(107) HashAggregate Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(95) HashAggregate +(108) HashAggregate Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS 
sum(l_quantity)#X] -(96) TakeOrderedAndProject +(109) TakeOrderedAndProject Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(97) AdaptiveSparkPlan +(110) AdaptiveSparkPlan Output [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/19.txt index 7084e61f61d58..21e4f472f3b3a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/19.txt @@ -1,37 +1,39 @@ == Physical Plan == -AdaptiveSparkPlan (34) +AdaptiveSparkPlan (38) +- == Final Plan == - VeloxColumnarToRowExec (22) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (24) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - HashAggregate (33) - +- HashAggregate (32) - +- Project (31) - +- ShuffledHashJoin Inner BuildRight (30) - :- Exchange (26) - : +- Project (25) - : +- Filter (24) - : +- Scan parquet (23) - +- Exchange (29) - +- Filter (28) - +- Scan parquet (27) + HashAggregate (37) + +- HashAggregate (36) + +- Project (35) + +- SortMergeJoin Inner (34) + :- Sort (29) + : +- Exchange (28) + : +- Project (27) + : +- Filter (26) + : +- Scan parquet (25) + +- Sort (33) + +- Exchange (32) + +- Filter (31) + +- Scan parquet (30) (1) Scan parquet @@ -53,140 +55,156 @@ Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipin Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: 
[l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS _pre_X#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_extendedprice#X, l_discount#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(20) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [2]: 
[sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(22) VeloxColumnarToRowExec +(24) VeloxColumnarToRowExec Input [1]: [revenue#X] -(23) Scan parquet +(25) Scan parquet Output [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipinstruct), In(l_shipmode, [AIR,AIR REG]), EqualTo(l_shipinstruct,DELIVER IN PERSON), IsNotNull(l_partkey), Or(Or(And(GreaterThanOrEqual(l_quantity,1.00),LessThanOrEqual(l_quantity,11.00)),And(GreaterThanOrEqual(l_quantity,10.00),LessThanOrEqual(l_quantity,20.00))),And(GreaterThanOrEqual(l_quantity,20.00),LessThanOrEqual(l_quantity,30.00)))] ReadSchema: struct -(24) Filter +(26) Filter Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Condition : ((((isnotnull(l_shipinstruct#X) AND l_shipmode#X IN (AIR,AIR REG)) AND (l_shipinstruct#X = DELIVER IN PERSON)) AND isnotnull(l_partkey#X)) AND ((((l_quantity#X >= 1.00) AND (l_quantity#X <= 11.00)) OR ((l_quantity#X >= 10.00) AND (l_quantity#X <= 20.00))) OR ((l_quantity#X >= 20.00) AND (l_quantity#X <= 30.00)))) -(25) Project +(27) Project Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] -(26) Exchange +(28) Exchange Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Scan parquet +(29) Sort +Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(30) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(28) Filter +(31) Filter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Condition : (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR 
(((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(29) Exchange +(32) Exchange Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(30) ShuffledHashJoin +(33) Sort +Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(34) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(31) Project +(35) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(32) HashAggregate +(36) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(33) HashAggregate +(37) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(34) AdaptiveSparkPlan +(38) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/20.txt index 3f6d175124779..1ac0992834eb2 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/20.txt @@ -1,121 +1,130 @@ == Physical Plan == -AdaptiveSparkPlan (126) +AdaptiveSparkPlan (146) +- == Final Plan == - VeloxColumnarToRowExec (86) - +- ^ SortExecTransformer (84) - +- ^ InputIteratorTransformer (83) - +- ^ InputAdapter (82) - +- ^ ShuffleQueryStage (81) - +- ColumnarExchange (80) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ 
ShuffledHashJoinExecTransformer LeftSemi (62) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (96) + +- ^ SortExecTransformer (94) + +- ^ InputIteratorTransformer (93) + +- ShuffleQueryStage (91) + +- ColumnarExchange (90) + +- VeloxAppendBatches (89) + +- ^ ProjectExecTransformer (87) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (86) + :- ^ InputIteratorTransformer (76) + : +- ShuffleQueryStage (74) + : +- ColumnarExchange (73) + : +- VeloxAppendBatches (72) + : +- ^ ProjectExecTransformer (70) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (69) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ ShuffledHashJoinExecTransformer Inner (55) - : :- ^ InputIteratorTransformer (31) - : : +- ^ InputAdapter (30) - : : +- ^ ShuffleQueryStage (29) - : : +- ColumnarExchange (28) - : : +- ^ ProjectExecTransformer (26) - : : +- ^ ShuffledHashJoinExecTransformer LeftSemi (25) - : : :- ^ InputIteratorTransformer (16) - : : : +- ^ InputAdapter (15) - : : : +- ^ ShuffleQueryStage (14) - : : : +- ColumnarExchange (13) - : : : +- ^ ProjectExecTransformer (11) - : : : +- ^ NoopFilter (10) - : : : +- ^ Scan parquet (9) - : : +- ^ InputIteratorTransformer (24) - : : +- ^ InputAdapter (23) - : : +- ^ ShuffleQueryStage (22) - : : +- ColumnarExchange (21) - : : +- ^ ProjectExecTransformer (19) - : : +- ^ NoopFilter (18) - : : +- ^ Scan parquet (17) - : +- ^ InputIteratorTransformer (54) - : +- ^ InputAdapter (53) - : +- ^ ShuffleQueryStage (52) - : +- ColumnarExchange (51) - : +- ^ ProjectExecTransformer (49) - : +- ^ FilterExecTransformer (48) - : +- ^ ProjectExecTransformer (47) - : +- ^ RegularHashAggregateExecTransformer (46) - : +- ^ RegularHashAggregateExecTransformer (45) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (44) - : :- ^ InputIteratorTransformer (39) - : : +- ^ InputAdapter (38) - : : +- ^ ShuffleQueryStage (37) - : : +- ColumnarExchange (36) - : : +- ^ ProjectExecTransformer (34) - : : +- ^ NoopFilter (33) - : : +- ^ Scan parquet (32) - : +- ^ InputIteratorTransformer (43) - : +- ^ InputAdapter (42) - : +- ^ ShuffleQueryStage (41) - : +- ReusedExchange (40) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ NoopFilter (70) - +- ^ Scan parquet (69) + : +- ^ InputIteratorTransformer (68) + : +- ShuffleQueryStage (66) + : +- ColumnarExchange (65) + : +- VeloxAppendBatches (64) + : +- ^ ProjectExecTransformer (62) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (61) + : :- ^ InputIteratorTransformer (35) + : : +- ShuffleQueryStage (33) + : : +- ColumnarExchange (32) + : : +- VeloxAppendBatches (31) + : : +- ^ ProjectExecTransformer (29) + : : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (28) + : : :- ^ InputIteratorTransformer (18) + : : : +- ShuffleQueryStage (16) + : : : +- ColumnarExchange (15) + : : : +- VeloxAppendBatches (14) + : : : +- ^ ProjectExecTransformer (12) + : : : +- ^ NoopFilter (11) + : : : +- ^ Scan parquet (10) + : : +- ^ InputIteratorTransformer (27) + : : +- ShuffleQueryStage (25) + : 
: +- ColumnarExchange (24) + : : +- VeloxAppendBatches (23) + : : +- ^ ProjectExecTransformer (21) + : : +- ^ NoopFilter (20) + : : +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (60) + : +- ShuffleQueryStage (58) + : +- ColumnarExchange (57) + : +- VeloxAppendBatches (56) + : +- ^ ProjectExecTransformer (54) + : +- ^ FilterExecTransformer (53) + : +- ^ ProjectExecTransformer (52) + : +- ^ RegularHashAggregateExecTransformer (51) + : +- ^ RegularHashAggregateExecTransformer (50) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (49) + : :- ^ InputIteratorTransformer (44) + : : +- ShuffleQueryStage (42) + : : +- ColumnarExchange (41) + : : +- VeloxAppendBatches (40) + : : +- ^ ProjectExecTransformer (38) + : : +- ^ NoopFilter (37) + : : +- ^ Scan parquet (36) + : +- ^ InputIteratorTransformer (48) + : +- ShuffleQueryStage (46) + : +- ReusedExchange (45) + +- ^ InputIteratorTransformer (85) + +- ShuffleQueryStage (83) + +- ColumnarExchange (82) + +- VeloxAppendBatches (81) + +- ^ ProjectExecTransformer (79) + +- ^ NoopFilter (78) + +- ^ Scan parquet (77) +- == Initial Plan == - Sort (125) - +- Exchange (124) - +- Project (123) - +- ShuffledHashJoin Inner BuildRight (122) - :- Exchange (117) - : +- Project (116) - : +- ShuffledHashJoin LeftSemi BuildRight (115) - : :- Exchange (89) - : : +- Filter (88) - : : +- Scan parquet (87) - : +- Exchange (114) - : +- Project (113) - : +- ShuffledHashJoin Inner BuildLeft (112) - : :- Exchange (98) - : : +- ShuffledHashJoin LeftSemi BuildRight (97) - : : :- Exchange (92) - : : : +- Filter (91) - : : : +- Scan parquet (90) - : : +- Exchange (96) - : : +- Project (95) - : : +- Filter (94) - : : +- Scan parquet (93) - : +- Exchange (111) - : +- Filter (110) - : +- HashAggregate (109) - : +- HashAggregate (108) - : +- ShuffledHashJoin LeftSemi BuildRight (107) - : :- Exchange (102) - : : +- Project (101) - : : +- Filter (100) - : : +- Scan parquet (99) - : +- Exchange (106) - : +- Project (105) - : +- Filter (104) - : +- Scan parquet (103) - +- Exchange (121) - +- Project (120) - +- Filter (119) - +- Scan parquet (118) + Sort (145) + +- Exchange (144) + +- Project (143) + +- SortMergeJoin Inner (142) + :- Sort (136) + : +- Exchange (135) + : +- Project (134) + : +- SortMergeJoin LeftSemi (133) + : :- Sort (100) + : : +- Exchange (99) + : : +- Filter (98) + : : +- Scan parquet (97) + : +- Sort (132) + : +- Exchange (131) + : +- Project (130) + : +- SortMergeJoin Inner (129) + : :- Sort (112) + : : +- Exchange (111) + : : +- SortMergeJoin LeftSemi (110) + : : :- Sort (104) + : : : +- Exchange (103) + : : : +- Filter (102) + : : : +- Scan parquet (101) + : : +- Sort (109) + : : +- Exchange (108) + : : +- Project (107) + : : +- Filter (106) + : : +- Scan parquet (105) + : +- Sort (128) + : +- Exchange (127) + : +- Filter (126) + : +- HashAggregate (125) + : +- HashAggregate (124) + : +- SortMergeJoin LeftSemi (123) + : :- Sort (117) + : : +- Exchange (116) + : : +- Project (115) + : : +- Filter (114) + : : +- Scan parquet (113) + : +- Sort (122) + : +- Exchange (121) + : +- Project (120) + : +- Filter (119) + : +- Scan parquet (118) + +- Sort (141) + +- Exchange (140) + +- Project (139) + +- Filter (138) + +- Scan parquet (137) (1) Scan parquet @@ -137,518 +146,598 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, 
s_address#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(18) NoopFilter +(20) NoopFilter Input [2]: [p_partkey#X, p_name#X] Arguments: [p_partkey#X, p_name#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [2]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(24) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [1]: [p_partkey#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [1]: [p_partkey#X] -(25) ShuffledHashJoinExecTransformer +(28) ShuffledHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, ps_suppkey#X, 42) AS hash_partition_key#X, 
ps_partkey#X, ps_suppkey#X, ps_availqty#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(32) Scan parquet +(36) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(33) NoopFilter +(37) NoopFilter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Arguments: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(34) ProjectExecTransformer +(38) ProjectExecTransformer Output [4]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] +Arguments: X + +(41) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] -(40) ReusedExchange [Reuses operator id: 21] +(45) ReusedExchange [Reuses operator id: 24] Output [1]: [p_partkey#X] -(41) ShuffleQueryStage +(46) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(42) InputAdapter +(47) InputAdapter Input [1]: [p_partkey#X] -(43) InputIteratorTransformer +(48) InputIteratorTransformer Input [1]: [p_partkey#X] -(44) ShuffledHashJoinExecTransformer +(49) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(45) RegularHashAggregateExecTransformer +(50) RegularHashAggregateExecTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(46) RegularHashAggregateExecTransformer +(51) RegularHashAggregateExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, 
l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(47) ProjectExecTransformer +(52) ProjectExecTransformer Output [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3), true) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(48) FilterExecTransformer +(53) FilterExecTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: isnotnull((0.5 * sum(l_quantity))#X) -(49) ProjectExecTransformer +(54) ProjectExecTransformer Output [4]: [hash(l_partkey#X, l_suppkey#X, 42) AS hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(50) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: false -(51) ColumnarExchange +(56) VeloxAppendBatches +Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] +Arguments: X + +(57) ColumnarExchange Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(52) ShuffleQueryStage +(58) ShuffleQueryStage Output [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: X -(53) InputAdapter +(59) InputAdapter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(54) InputIteratorTransformer +(60) InputIteratorTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(55) ShuffledHashJoinExecTransformer +(61) ShuffledHashJoinExecTransformer Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(cast(ps_availqty#X as decimal(10,0)) as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(56) ProjectExecTransformer +(62) ProjectExecTransformer Output [2]: [hash(ps_suppkey#X, 42) AS hash_partition_key#X, ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(57) WholeStageCodegenTransformer (X) +(63) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, ps_suppkey#X] Arguments: false -(58) ColumnarExchange +(64) VeloxAppendBatches +Input [2]: [hash_partition_key#X, ps_suppkey#X] +Arguments: X + +(65) ColumnarExchange Input [2]: [hash_partition_key#X, ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(66) ShuffleQueryStage Output [1]: [ps_suppkey#X] Arguments: X -(60) InputAdapter +(67) InputAdapter Input [1]: [ps_suppkey#X] -(61) InputIteratorTransformer +(68) InputIteratorTransformer Input [1]: [ps_suppkey#X] -(62) ShuffledHashJoinExecTransformer +(69) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(63) ProjectExecTransformer +(70) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(64) WholeStageCodegenTransformer (X) +(71) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(65) ColumnarExchange 
+(72) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(73) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(74) ShuffleQueryStage Output [3]: [s_name#X, s_address#X, s_nationkey#X] Arguments: X -(67) InputAdapter +(75) InputAdapter Input [3]: [s_name#X, s_address#X, s_nationkey#X] -(68) InputIteratorTransformer +(76) InputIteratorTransformer Input [3]: [s_name#X, s_address#X, s_nationkey#X] -(69) Scan parquet +(77) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(70) NoopFilter +(78) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(71) ProjectExecTransformer +(79) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(72) WholeStageCodegenTransformer (X) +(80) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(73) ColumnarExchange +(81) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(82) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(83) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(75) InputAdapter +(84) InputAdapter Input [1]: [n_nationkey#X] -(76) InputIteratorTransformer +(85) InputIteratorTransformer Input [1]: [n_nationkey#X] -(77) ShuffledHashJoinExecTransformer +(86) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(78) ProjectExecTransformer +(87) ProjectExecTransformer Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(79) WholeStageCodegenTransformer (X) +(88) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(80) ColumnarExchange +(89) VeloxAppendBatches +Input [2]: [s_name#X, s_address#X] +Arguments: X + +(90) ColumnarExchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(81) ShuffleQueryStage +(91) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] Arguments: X -(82) InputAdapter +(92) InputAdapter Input [2]: [s_name#X, s_address#X] -(83) InputIteratorTransformer +(93) InputIteratorTransformer Input [2]: [s_name#X, s_address#X] -(84) SortExecTransformer +(94) SortExecTransformer Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(85) WholeStageCodegenTransformer (X) +(95) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(86) VeloxColumnarToRowExec +(96) VeloxColumnarToRowExec Input [2]: [s_name#X, s_address#X] -(87) Scan parquet +(97) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_nationkey)] ReadSchema: struct -(88) Filter +(98) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Condition : isnotnull(s_nationkey#X) -(89) Exchange +(99) Exchange Input [4]: [s_suppkey#X, 
s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) Scan parquet +(100) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(101) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(91) Filter +(102) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Condition : ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(92) Exchange +(103) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(93) Scan parquet +(104) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: [ps_partkey#X ASC NULLS FIRST], false, 0 + +(105) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(94) Filter +(106) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(95) Project +(107) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(96) Exchange +(108) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) ShuffledHashJoin +(109) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(110) SortMergeJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(98) Exchange +(111) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Scan parquet +(112) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: [ps_partkey#X ASC NULLS FIRST, ps_suppkey#X ASC NULLS FIRST], false, 0 + +(113) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(100) Filter +(114) Filter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(101) Project +(115) Project Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(102) Exchange +(116) Exchange Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(103) Scan parquet +(117) Sort +Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(118) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(104) Filter +(119) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(105) Project +(120) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(106) Exchange +(121) Exchange Input 
[1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(107) ShuffledHashJoin +(122) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(123) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(108) HashAggregate +(124) HashAggregate Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(109) HashAggregate +(125) HashAggregate Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3), true) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(110) Filter +(126) Filter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Condition : isnotnull((0.5 * sum(l_quantity))#X) -(111) Exchange +(127) Exchange Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) ShuffledHashJoin +(128) Sort +Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] +Arguments: [l_partkey#X ASC NULLS FIRST, l_suppkey#X ASC NULLS FIRST], false, 0 + +(129) SortMergeJoin Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(cast(ps_availqty#X as decimal(10,0)) as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(113) Project +(130) Project Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(114) Exchange +(131) Exchange Input [1]: [ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(115) ShuffledHashJoin +(132) Sort +Input [1]: [ps_suppkey#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(133) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(116) Project +(134) Project Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(117) Exchange +(135) Exchange Input [3]: [s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(118) Scan parquet +(136) Sort +Input [3]: [s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(137) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(119) Filter +(138) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(120) Project +(139) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(121) Exchange +(140) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(122) ShuffledHashJoin +(141) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(142) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(123) Project +(143) Project Output [2]: [s_name#X, s_address#X] Input 
[4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(124) Exchange +(144) Exchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(125) Sort +(145) Sort Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(126) AdaptiveSparkPlan +(146) AdaptiveSparkPlan Output [2]: [s_name#X, s_address#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/21.txt index b7067f8b5e3a9..e2a72528c4ed4 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/21.txt @@ -1,114 +1,123 @@ == Physical Plan == -AdaptiveSparkPlan (119) +AdaptiveSparkPlan (138) +- == Final Plan == - VeloxColumnarToRowExec (82) - +- TakeOrderedAndProjectExecTransformer (81) - +- ^ RegularHashAggregateExecTransformer (79) - +- ^ InputIteratorTransformer (78) - +- ^ InputAdapter (77) - +- ^ ShuffleQueryStage (76) - +- ColumnarExchange (75) - +- ^ ProjectExecTransformer (73) - +- ^ FlushableHashAggregateExecTransformer (72) - +- ^ ProjectExecTransformer (71) - +- ^ ShuffledHashJoinExecTransformer Inner (70) - :- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ ShuffledHashJoinExecTransformer Inner (55) - : :- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ ShuffledHashJoinExecTransformer Inner (40) - : : :- ^ InputIteratorTransformer (8) - : : : +- ^ InputAdapter (7) - : : : +- ^ ShuffleQueryStage (6) - : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (92) + +- TakeOrderedAndProjectExecTransformer (91) + +- ^ RegularHashAggregateExecTransformer (89) + +- ^ InputIteratorTransformer (88) + +- ShuffleQueryStage (86) + +- ColumnarExchange (85) + +- VeloxAppendBatches (84) + +- ^ ProjectExecTransformer (82) + +- ^ FlushableHashAggregateExecTransformer (81) + +- ^ ProjectExecTransformer (80) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (79) + :- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (62) + : :- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (45) + : : :- ^ InputIteratorTransformer (9) + : : : +- ShuffleQueryStage (7) + : : : +- ColumnarExchange (6) + : : : +- VeloxAppendBatches (5) : : : +- ^ ProjectExecTransformer (3) : : : +- ^ NoopFilter (2) : : : +- ^ Scan parquet (1) - : : +- ^ InputIteratorTransformer (39) - : : +- ^ InputAdapter (38) - : : +- ^ ShuffleQueryStage (37) - : : +- ColumnarExchange (36) - : : +- ^ ProjectExecTransformer (34) - : : +- ^ ShuffledHashJoinExecTransformer LeftAnti (33) - : : :- ^ ShuffledHashJoinExecTransformer LeftSemi (24) - : : : :- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ 
NoopFilter (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (23) - : : : +- ^ InputAdapter (22) - : : : +- ^ ShuffleQueryStage (21) - : : : +- ColumnarExchange (20) - : : : +- ^ ProjectExecTransformer (18) - : : : +- ^ Scan parquet (17) - : : +- ^ InputIteratorTransformer (32) - : : +- ^ InputAdapter (31) - : : +- ^ ShuffleQueryStage (30) - : : +- ColumnarExchange (29) - : : +- ^ ProjectExecTransformer (27) - : : +- ^ NoopFilter (26) - : : +- ^ Scan parquet (25) - : +- ^ InputIteratorTransformer (54) - : +- ^ InputAdapter (53) - : +- ^ ShuffleQueryStage (52) - : +- ColumnarExchange (51) - : +- ^ ProjectExecTransformer (49) - : +- ^ NoopFilter (48) - : +- ^ Scan parquet (47) - +- ^ InputIteratorTransformer (69) - +- ^ InputAdapter (68) - +- ^ ShuffleQueryStage (67) - +- ColumnarExchange (66) - +- ^ ProjectExecTransformer (64) - +- ^ NoopFilter (63) - +- ^ Scan parquet (62) + : : +- ^ InputIteratorTransformer (44) + : : +- ShuffleQueryStage (42) + : : +- ColumnarExchange (41) + : : +- VeloxAppendBatches (40) + : : +- ^ ProjectExecTransformer (38) + : : +- ^ ShuffledHashJoinExecTransformer LeftAnti BuildRight (37) + : : :- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (27) + : : : :- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ NoopFilter (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (26) + : : : +- ShuffleQueryStage (24) + : : : +- ColumnarExchange (23) + : : : +- VeloxAppendBatches (22) + : : : +- ^ ProjectExecTransformer (20) + : : : +- ^ Scan parquet (19) + : : +- ^ InputIteratorTransformer (36) + : : +- ShuffleQueryStage (34) + : : +- ColumnarExchange (33) + : : +- VeloxAppendBatches (32) + : : +- ^ ProjectExecTransformer (30) + : : +- ^ NoopFilter (29) + : : +- ^ Scan parquet (28) + : +- ^ InputIteratorTransformer (61) + : +- ShuffleQueryStage (59) + : +- ColumnarExchange (58) + : +- VeloxAppendBatches (57) + : +- ^ ProjectExecTransformer (55) + : +- ^ NoopFilter (54) + : +- ^ Scan parquet (53) + +- ^ InputIteratorTransformer (78) + +- ShuffleQueryStage (76) + +- ColumnarExchange (75) + +- VeloxAppendBatches (74) + +- ^ ProjectExecTransformer (72) + +- ^ NoopFilter (71) + +- ^ Scan parquet (70) +- == Initial Plan == - TakeOrderedAndProject (118) - +- HashAggregate (117) - +- Exchange (116) - +- HashAggregate (115) - +- Project (114) - +- ShuffledHashJoin Inner BuildRight (113) - :- Exchange (108) - : +- Project (107) - : +- ShuffledHashJoin Inner BuildRight (106) - : :- Exchange (101) - : : +- Project (100) - : : +- ShuffledHashJoin Inner BuildLeft (99) - : : :- Exchange (85) - : : : +- Filter (84) - : : : +- Scan parquet (83) - : : +- Exchange (98) - : : +- ShuffledHashJoin LeftAnti BuildRight (97) - : : :- ShuffledHashJoin LeftSemi BuildRight (92) - : : : :- Exchange (89) - : : : : +- Project (88) - : : : : +- Filter (87) - : : : : +- Scan parquet (86) - : : : +- Exchange (91) - : : : +- Scan parquet (90) - : : +- Exchange (96) - : : +- Project (95) - : : +- Filter (94) - : : +- Scan parquet (93) - : +- Exchange (105) - : +- Project (104) - : +- Filter (103) - : +- Scan parquet (102) - +- Exchange (112) - +- Project (111) - +- Filter (110) - +- Scan parquet (109) + TakeOrderedAndProject (137) + +- HashAggregate (136) + +- Exchange (135) + +- HashAggregate (134) + +- Project (133) + +- SortMergeJoin Inner (132) + :- Sort (126) + : +- Exchange (125) + : +- Project (124) + : 
+- SortMergeJoin Inner (123) + : :- Sort (117) + : : +- Exchange (116) + : : +- Project (115) + : : +- SortMergeJoin Inner (114) + : : :- Sort (96) + : : : +- Exchange (95) + : : : +- Filter (94) + : : : +- Scan parquet (93) + : : +- Sort (113) + : : +- Exchange (112) + : : +- SortMergeJoin LeftAnti (111) + : : :- SortMergeJoin LeftSemi (105) + : : : :- Sort (101) + : : : : +- Exchange (100) + : : : : +- Project (99) + : : : : +- Filter (98) + : : : : +- Scan parquet (97) + : : : +- Sort (104) + : : : +- Exchange (103) + : : : +- Scan parquet (102) + : : +- Sort (110) + : : +- Exchange (109) + : : +- Project (108) + : : +- Filter (107) + : : +- Scan parquet (106) + : +- Sort (122) + : +- Exchange (121) + : +- Project (120) + : +- Filter (119) + : +- Scan parquet (118) + +- Sort (131) + +- Exchange (130) + +- Project (129) + +- Filter (128) + +- Scan parquet (127) (1) Scan parquet @@ -130,494 +139,570 @@ Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Arguments: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [2]: [l_orderkey#X, l_suppkey#X] -(19) WholeStageCodegenTransformer (X) +(21) 
WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(23) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(24) ShuffledHashJoinExecTransformer +(27) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(25) Scan parquet +(28) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(26) NoopFilter +(29) NoopFilter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Arguments: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(27) ProjectExecTransformer +(30) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(28) WholeStageCodegenTransformer (X) +(31) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(29) ColumnarExchange +(32) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(33) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(30) ShuffleQueryStage +(34) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(31) InputAdapter +(35) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(32) InputIteratorTransformer +(36) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(33) ShuffledHashJoinExecTransformer +(37) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(34) ProjectExecTransformer +(38) ProjectExecTransformer Output [3]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [2]: [l_orderkey#X, l_suppkey#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(41) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(40) ShuffledHashJoinExecTransformer +(45) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: 
[l_suppkey#X] Join condition: None -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] -(47) Scan parquet +(53) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(48) NoopFilter +(54) NoopFilter Input [2]: [o_orderkey#X, o_orderstatus#X] Arguments: [o_orderkey#X, o_orderstatus#X] -(49) ProjectExecTransformer +(55) ProjectExecTransformer Output [2]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(50) WholeStageCodegenTransformer (X) +(56) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, o_orderkey#X] Arguments: false -(51) ColumnarExchange +(57) VeloxAppendBatches +Input [2]: [hash_partition_key#X, o_orderkey#X] +Arguments: X + +(58) ColumnarExchange Input [2]: [hash_partition_key#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [id=#X] -(52) ShuffleQueryStage +(59) ShuffleQueryStage Output [1]: [o_orderkey#X] Arguments: X -(53) InputAdapter +(60) InputAdapter Input [1]: [o_orderkey#X] -(54) InputIteratorTransformer +(61) InputIteratorTransformer Input [1]: [o_orderkey#X] -(55) ShuffledHashJoinExecTransformer +(62) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [s_name#X, s_nationkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [s_name#X, s_nationkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [s_name#X, s_nationkey#X] -(62) Scan parquet +(70) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: 
InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(63) NoopFilter +(71) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(64) ProjectExecTransformer +(72) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(65) WholeStageCodegenTransformer (X) +(73) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(66) ColumnarExchange +(74) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(75) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(67) ShuffleQueryStage +(76) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(68) InputAdapter +(77) InputAdapter Input [1]: [n_nationkey#X] -(69) InputIteratorTransformer +(78) InputIteratorTransformer Input [1]: [n_nationkey#X] -(70) ShuffledHashJoinExecTransformer +(79) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(72) FlushableHashAggregateExecTransformer +(81) FlushableHashAggregateExecTransformer Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(73) ProjectExecTransformer +(82) ProjectExecTransformer Output [3]: [hash(s_name#X, 42) AS hash_partition_key#X, s_name#X, count#X] Input [2]: [s_name#X, count#X] -(74) WholeStageCodegenTransformer (X) +(83) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: false -(75) ColumnarExchange +(84) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, count#X] +Arguments: X + +(85) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] -(76) ShuffleQueryStage +(86) ShuffleQueryStage Output [2]: [s_name#X, count#X] Arguments: X -(77) InputAdapter +(87) InputAdapter Input [2]: [s_name#X, count#X] -(78) InputIteratorTransformer +(88) InputIteratorTransformer Input [2]: [s_name#X, count#X] -(79) RegularHashAggregateExecTransformer +(89) RegularHashAggregateExecTransformer Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(80) WholeStageCodegenTransformer (X) +(90) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, numwait#X] Arguments: false -(81) TakeOrderedAndProjectExecTransformer +(91) TakeOrderedAndProjectExecTransformer Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X], 0 -(82) VeloxColumnarToRowExec +(92) VeloxColumnarToRowExec Input [2]: [s_name#X, numwait#X] -(83) Scan parquet +(93) Scan parquet Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(84) Filter +(94) Filter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(85) Exchange +(95) Exchange 
Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(86) Scan parquet +(96) Sort +Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(97) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(87) Filter +(98) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(88) Project +(99) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(89) Exchange +(100) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) Scan parquet +(101) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(102) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(91) Exchange +(103) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(92) ShuffledHashJoin +(104) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(105) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(93) Scan parquet +(106) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(94) Filter +(107) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(95) Project +(108) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(96) Exchange +(109) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) ShuffledHashJoin +(110) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(111) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(98) Exchange +(112) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) ShuffledHashJoin +(113) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(114) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(100) Project +(115) Project Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(101) Exchange +(116) Exchange Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) Scan parquet +(117) Sort +Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] +Arguments: 
[l_orderkey#X ASC NULLS FIRST], false, 0 + +(118) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(103) Filter +(119) Filter Input [2]: [o_orderkey#X, o_orderstatus#X] Condition : ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(104) Project +(120) Project Output [1]: [o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(105) Exchange +(121) Exchange Input [1]: [o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) ShuffledHashJoin +(122) Sort +Input [1]: [o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(123) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(107) Project +(124) Project Output [2]: [s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(108) Exchange +(125) Exchange Input [2]: [s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) Scan parquet +(126) Sort +Input [2]: [s_name#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(127) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(110) Filter +(128) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(111) Project +(129) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(112) Exchange +(130) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(113) ShuffledHashJoin +(131) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(114) Project +(133) Project Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(115) HashAggregate +(134) HashAggregate Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(116) Exchange +(135) Exchange Input [2]: [s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) HashAggregate +(136) HashAggregate Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(118) TakeOrderedAndProject +(137) TakeOrderedAndProject Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X] -(119) AdaptiveSparkPlan +(138) AdaptiveSparkPlan Output [2]: [s_name#X, numwait#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/22.txt index dc38dbb7675af..984abd470378b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/22.txt @@ -1,47 +1,49 @@ == Physical Plan == -AdaptiveSparkPlan (46) +AdaptiveSparkPlan (52) +- == 
Final Plan == - VeloxColumnarToRowExec (33) - +- ^ SortExecTransformer (31) - +- ^ InputIteratorTransformer (30) - +- ^ InputAdapter (29) - +- ^ ShuffleQueryStage (28) - +- ColumnarExchange (27) - +- ^ RegularHashAggregateExecTransformer (25) - +- ^ InputIteratorTransformer (24) - +- ^ InputAdapter (23) - +- ^ ShuffleQueryStage (22) - +- ColumnarExchange (21) - +- ^ ProjectExecTransformer (19) - +- ^ FlushableHashAggregateExecTransformer (18) - +- ^ ProjectExecTransformer (17) - +- ^ ShuffledHashJoinExecTransformer LeftAnti (16) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (37) + +- ^ SortExecTransformer (35) + +- ^ InputIteratorTransformer (34) + +- ShuffleQueryStage (32) + +- ColumnarExchange (31) + +- VeloxAppendBatches (30) + +- ^ RegularHashAggregateExecTransformer (28) + +- ^ InputIteratorTransformer (27) + +- ShuffleQueryStage (25) + +- ColumnarExchange (24) + +- VeloxAppendBatches (23) + +- ^ ProjectExecTransformer (21) + +- ^ FlushableHashAggregateExecTransformer (20) + +- ^ ProjectExecTransformer (19) + +- ^ ShuffledHashJoinExecTransformer LeftAnti BuildRight (18) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (15) - +- ^ InputAdapter (14) - +- ^ ShuffleQueryStage (13) - +- ColumnarExchange (12) - +- ^ ProjectExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (17) + +- ShuffleQueryStage (15) + +- ColumnarExchange (14) + +- VeloxAppendBatches (13) + +- ^ ProjectExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (45) - +- Exchange (44) - +- HashAggregate (43) - +- Exchange (42) - +- HashAggregate (41) - +- Project (40) - +- ShuffledHashJoin LeftAnti BuildRight (39) - :- Exchange (36) - : +- Filter (35) - : +- Scan parquet (34) - +- Exchange (38) - +- Scan parquet (37) + Sort (51) + +- Exchange (50) + +- HashAggregate (49) + +- Exchange (48) + +- HashAggregate (47) + +- Project (46) + +- SortMergeJoin LeftAnti (45) + :- Sort (41) + : +- Exchange (40) + : +- Filter (39) + : +- Scan parquet (38) + +- Sort (44) + +- Exchange (43) + +- Scan parquet (42) (1) Scan parquet @@ -63,182 +65,206 @@ Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(9) Scan parquet +(10) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(10) ProjectExecTransformer +(11) ProjectExecTransformer Output [2]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_custkey#X] Input [1]: [o_custkey#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [2]: 
[hash_partition_key#X, o_custkey#X] Arguments: false -(12) ColumnarExchange +(13) VeloxAppendBatches +Input [2]: [hash_partition_key#X, o_custkey#X] +Arguments: X + +(14) ColumnarExchange Input [2]: [hash_partition_key#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [id=#X] -(13) ShuffleQueryStage +(15) ShuffleQueryStage Output [1]: [o_custkey#X] Arguments: X -(14) InputAdapter +(16) InputAdapter Input [1]: [o_custkey#X] -(15) InputIteratorTransformer +(17) InputIteratorTransformer Input [1]: [o_custkey#X] -(16) ShuffledHashJoinExecTransformer +(18) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(17) ProjectExecTransformer +(19) ProjectExecTransformer Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(18) FlushableHashAggregateExecTransformer +(20) FlushableHashAggregateExecTransformer Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [5]: [hash(cntrycode#X, 42) AS hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] +Arguments: X + +(24) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(25) RegularHashAggregateExecTransformer +(28) RegularHashAggregateExecTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(26) WholeStageCodegenTransformer (X) +(29) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(27) ColumnarExchange +(30) VeloxAppendBatches +Input [3]: [cntrycode#X, numcust#X, totacctbal#X] +Arguments: X + +(31) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(28) ShuffleQueryStage +(32) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: X -(29) InputAdapter +(33) InputAdapter Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(30) InputIteratorTransformer +(34) InputIteratorTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(31) SortExecTransformer +(35) SortExecTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(32) 
WholeStageCodegenTransformer (X) +(36) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(33) VeloxColumnarToRowExec +(37) VeloxColumnarToRowExec Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(34) Scan parquet +(38) Scan parquet Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal)] ReadSchema: struct -(35) Filter +(39) Filter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) AND (cast(c_acctbal#X as decimal(16,6)) > Subquery subquery#X, [id=#X])) -(36) Exchange +(40) Exchange Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(37) Scan parquet +(41) Sort +Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(42) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Exchange +(43) Exchange Input [1]: [o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) ShuffledHashJoin +(44) Sort +Input [1]: [o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(45) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(40) Project +(46) Project Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(41) HashAggregate +(47) HashAggregate Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(42) Exchange +(48) Exchange Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) HashAggregate +(49) HashAggregate Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(44) Exchange +(50) Exchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(45) Sort +(51) Sort Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(46) AdaptiveSparkPlan +(52) AdaptiveSparkPlan Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/3.txt index d6c6f13e90639..58484edaa685a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/3.txt @@ -1,60 +1,64 @@ == Physical Plan == -AdaptiveSparkPlan (59) +AdaptiveSparkPlan (67) +- == Final Plan == - VeloxColumnarToRowExec (39) - +- TakeOrderedAndProjectExecTransformer (38) - +- ^ ProjectExecTransformer (36) - +- ^ RegularHashAggregateExecTransformer (35) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ ProjectExecTransformer (33) - +- ^ ShuffledHashJoinExecTransformer Inner (32) - :- ^ 
InputIteratorTransformer (23) - : +- ^ InputAdapter (22) - : +- ^ ShuffleQueryStage (21) - : +- ColumnarExchange (20) - : +- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (43) + +- TakeOrderedAndProjectExecTransformer (42) + +- ^ ProjectExecTransformer (40) + +- ^ RegularHashAggregateExecTransformer (39) + +- ^ RegularHashAggregateExecTransformer (38) + +- ^ ProjectExecTransformer (37) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + :- ^ InputIteratorTransformer (26) + : +- ShuffleQueryStage (24) + : +- ColumnarExchange (23) + : +- VeloxAppendBatches (22) + : +- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ NoopFilter (10) - : +- ^ Scan parquet (9) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29) - +- ColumnarExchange (28) - +- ^ ProjectExecTransformer (26) - +- ^ NoopFilter (25) - +- ^ Scan parquet (24) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ NoopFilter (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ ProjectExecTransformer (29) + +- ^ NoopFilter (28) + +- ^ Scan parquet (27) +- == Initial Plan == - TakeOrderedAndProject (58) - +- HashAggregate (57) - +- HashAggregate (56) - +- Project (55) - +- ShuffledHashJoin Inner BuildRight (54) - :- Exchange (49) - : +- Project (48) - : +- ShuffledHashJoin Inner BuildLeft (47) - : :- Exchange (43) - : : +- Project (42) - : : +- Filter (41) - : : +- Scan parquet (40) - : +- Exchange (46) - : +- Filter (45) - : +- Scan parquet (44) - +- Exchange (53) - +- Project (52) - +- Filter (51) - +- Scan parquet (50) + TakeOrderedAndProject (66) + +- HashAggregate (65) + +- HashAggregate (64) + +- Project (63) + +- SortMergeJoin Inner (62) + :- Sort (56) + : +- Exchange (55) + : +- Project (54) + : +- SortMergeJoin Inner (53) + : :- Sort (48) + : : +- Exchange (47) + : : +- Project (46) + : : +- Filter (45) + : : +- Scan parquet (44) + : +- Sort (52) + : +- Exchange (51) + : +- Filter (50) + : +- Scan parquet (49) + +- Sort (61) + +- Exchange (60) + +- Project (59) + +- Filter (58) + +- Scan parquet (57) (1) Scan parquet @@ -76,244 +80,276 @@ Input [2]: [c_custkey#X, c_mktsegment#X] Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, c_custkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [c_custkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [c_custkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [c_custkey#X] 
-(9) Scan parquet +(10) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] +Arguments: X + +(23) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: 
[hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [6]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS _pre_X#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(34) RegularHashAggregateExecTransformer +(38) RegularHashAggregateExecTransformer Input [6]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(35) RegularHashAggregateExecTransformer +(39) RegularHashAggregateExecTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(36) ProjectExecTransformer +(40) ProjectExecTransformer Output [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, o_orderdate#X, o_shippriority#X] Input [4]: [l_orderkey#X, 
o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(37) WholeStageCodegenTransformer (X) +(41) WholeStageCodegenTransformer (X) Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: false -(38) TakeOrderedAndProjectExecTransformer +(42) TakeOrderedAndProjectExecTransformer Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X], 0 -(39) VeloxColumnarToRowExec +(43) VeloxColumnarToRowExec Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(40) Scan parquet +(44) Scan parquet Output [2]: [c_custkey#X, c_mktsegment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_mktsegment), EqualTo(c_mktsegment,BUILDING), IsNotNull(c_custkey)] ReadSchema: struct -(41) Filter +(45) Filter Input [2]: [c_custkey#X, c_mktsegment#X] Condition : ((isnotnull(c_mktsegment#X) AND (c_mktsegment#X = BUILDING)) AND isnotnull(c_custkey#X)) -(42) Project +(46) Project Output [1]: [c_custkey#X] Input [2]: [c_custkey#X, c_mktsegment#X] -(43) Exchange +(47) Exchange Input [1]: [c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(44) Scan parquet +(48) Sort +Input [1]: [c_custkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(49) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(45) Filter +(50) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Condition : (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(46) Exchange +(51) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(47) ShuffledHashJoin +(52) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(53) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(48) Project +(54) Project Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(49) Exchange +(55) Exchange Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(50) Scan parquet +(56) Sort +Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(57) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(51) Filter +(58) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(52) Project +(59) Project Output [3]: [l_orderkey#X, 
l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(53) Exchange +(60) Exchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) ShuffledHashJoin +(61) Sort +Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(62) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(55) Project +(63) Project Output [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(56) HashAggregate +(64) HashAggregate Input [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(57) HashAggregate +(65) HashAggregate Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, o_orderdate#X, o_shippriority#X] -(58) TakeOrderedAndProject +(66) TakeOrderedAndProject Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(59) AdaptiveSparkPlan +(67) AdaptiveSparkPlan Output [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/4.txt index 4c33fb73f7576..cb7a3c3a0955c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/4.txt @@ -1,51 +1,53 @@ == Physical Plan == -AdaptiveSparkPlan (50) +AdaptiveSparkPlan (56) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- ^ SortExecTransformer (32) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29) - +- ColumnarExchange (28) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23) - +- 
ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer LeftSemi (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (49) - +- Exchange (48) - +- HashAggregate (47) - +- Exchange (46) - +- HashAggregate (45) - +- Project (44) - +- ShuffledHashJoin LeftSemi BuildRight (43) - :- Exchange (38) - : +- Project (37) - : +- Filter (36) - : +- Scan parquet (35) - +- Exchange (42) - +- Project (41) - +- Filter (40) - +- Scan parquet (39) + Sort (55) + +- Exchange (54) + +- HashAggregate (53) + +- Exchange (52) + +- HashAggregate (51) + +- Project (50) + +- SortMergeJoin LeftSemi (49) + :- Sort (43) + : +- Exchange (42) + : +- Project (41) + : +- Filter (40) + : +- Scan parquet (39) + +- Sort (48) + +- Exchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -67,200 +69,224 @@ Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [o_orderkey#X, o_orderpriority#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderpriority#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Arguments: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [2]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(12) 
WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, l_orderkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [2]: [hash_partition_key#X, l_orderkey#X] +Arguments: X + +(15) ColumnarExchange Input [2]: [hash_partition_key#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [1]: [l_orderkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [1]: [l_orderkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [1]: [l_orderkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [3]: [hash(o_orderpriority#X, 42) AS hash_partition_key#X, o_orderpriority#X, count#X] Input [2]: [o_orderpriority#X, count#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] +Arguments: X + +(25) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [2]: [o_orderpriority#X, count#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [2]: [o_orderpriority#X, count#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [2]: [o_orderpriority#X, order_count#X] +Arguments: X + +(32) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [o_orderpriority#X, order_count#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [o_orderpriority#X, order_count#X] -(32) SortExecTransformer +(36) SortExecTransformer Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(33) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(34) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec 
Input [2]: [o_orderpriority#X, order_count#X] -(35) Scan parquet +(39) Scan parquet Output [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-07-01), LessThan(o_orderdate,1993-10-01)] ReadSchema: struct -(36) Filter +(40) Filter Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Condition : ((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-07-01)) AND (o_orderdate#X < 1993-10-01)) -(37) Project +(41) Project Output [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] -(38) Exchange +(42) Exchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) Scan parquet +(43) Sort +Input [2]: [o_orderkey#X, o_orderpriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(44) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(40) Filter +(45) Filter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(41) Project +(46) Project Output [1]: [l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(42) Exchange +(47) Exchange Input [1]: [l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) ShuffledHashJoin +(48) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(49) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(44) Project +(50) Project Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(45) HashAggregate +(51) HashAggregate Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(46) Exchange +(52) Exchange Input [2]: [o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(47) HashAggregate +(53) HashAggregate Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(48) Exchange +(54) Exchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) Sort +(55) Sort Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(50) AdaptiveSparkPlan +(56) AdaptiveSparkPlan Output [2]: [o_orderpriority#X, order_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/5.txt index 8bc2587dbaee7..930a5a0bf4886 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/5.txt @@ -1,127 +1,137 @@ == Physical Plan == -AdaptiveSparkPlan (134) +AdaptiveSparkPlan (156) +- == Final Plan == - VeloxColumnarToRowExec (94) - +- ^ SortExecTransformer (92) - +- ^ InputIteratorTransformer (91) - 
+- ^ InputAdapter (90) - +- ^ ShuffleQueryStage (89) - +- ColumnarExchange (88) - +- ^ RegularHashAggregateExecTransformer (86) - +- ^ InputIteratorTransformer (85) - +- ^ InputAdapter (84) - +- ^ ShuffleQueryStage (83) - +- ColumnarExchange (82) - +- ^ ProjectExecTransformer (80) - +- ^ FlushableHashAggregateExecTransformer (79) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (106) + +- ^ SortExecTransformer (104) + +- ^ InputIteratorTransformer (103) + +- ShuffleQueryStage (101) + +- ColumnarExchange (100) + +- VeloxAppendBatches (99) + +- ^ RegularHashAggregateExecTransformer (97) + +- ^ InputIteratorTransformer (96) + +- ShuffleQueryStage (94) + +- ColumnarExchange (93) + +- VeloxAppendBatches (92) + +- ^ ProjectExecTransformer (90) + +- ^ FlushableHashAggregateExecTransformer (89) + +- ^ ProjectExecTransformer (88) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ NoopFilter (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ NoopFilter (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ 
InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ NoopFilter (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ NoopFilter (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ NoopFilter (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ NoopFilter (70) - +- ^ Scan parquet (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ NoopFilter (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ NoopFilter (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ NoopFilter (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ NoopFilter (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (86) + +- ShuffleQueryStage (84) + +- ColumnarExchange (83) + +- VeloxAppendBatches (82) + +- ^ ProjectExecTransformer (80) + +- ^ NoopFilter (79) + +- ^ Scan parquet (78) +- == Initial Plan == - Sort (133) - +- Exchange (132) - +- HashAggregate (131) - +- Exchange (130) - +- HashAggregate (129) - +- Project (128) - +- ShuffledHashJoin Inner BuildRight (127) - :- Exchange (122) - : +- Project (121) - : +- ShuffledHashJoin Inner BuildRight (120) - : :- Exchange (116) - : : +- Project (115) - : : +- ShuffledHashJoin Inner BuildRight (114) - : : :- Exchange (110) - : : : +- Project (109) - : : : +- ShuffledHashJoin Inner BuildRight (108) - : : : :- Exchange (104) - : : : : +- Project (103) - : : : : +- ShuffledHashJoin Inner BuildLeft (102) - : : : : :- Exchange (97) - : : : : : +- Filter (96) - : : : : : +- Scan parquet (95) - : : : : +- Exchange (101) - : : : : +- Project (100) - : : : : +- Filter (99) - : : : : +- Scan parquet (98) - : : : +- Exchange (107) - : : : +- Filter (106) - : : : +- Scan parquet (105) - : : +- Exchange (113) - : : +- Filter (112) - : : +- Scan parquet (111) - : +- Exchange (119) - : +- Filter (118) - : +- Scan parquet (117) - +- Exchange (126) - +- Project (125) - +- Filter (124) - +- Scan parquet (123) + Sort (155) + +- Exchange (154) + +- HashAggregate (153) + +- Exchange (152) + +- HashAggregate (151) + +- Project (150) + +- SortMergeJoin Inner (149) + :- Sort (143) + : +- Exchange (142) + : +- Project (141) + : +- SortMergeJoin Inner (140) + : :- Sort (135) + : : +- Exchange (134) + : : +- Project (133) + : : +- SortMergeJoin Inner (132) + : : :- Sort (127) + : : : +- Exchange (126) + : : : +- Project (125) + : : : +- 
SortMergeJoin Inner (124) + : : : :- Sort (119) + : : : : +- Exchange (118) + : : : : +- Project (117) + : : : : +- SortMergeJoin Inner (116) + : : : : :- Sort (110) + : : : : : +- Exchange (109) + : : : : : +- Filter (108) + : : : : : +- Scan parquet (107) + : : : : +- Sort (115) + : : : : +- Exchange (114) + : : : : +- Project (113) + : : : : +- Filter (112) + : : : : +- Scan parquet (111) + : : : +- Sort (123) + : : : +- Exchange (122) + : : : +- Filter (121) + : : : +- Scan parquet (120) + : : +- Sort (131) + : : +- Exchange (130) + : : +- Filter (129) + : : +- Scan parquet (128) + : +- Sort (139) + : +- Exchange (138) + : +- Filter (137) + : +- Scan parquet (136) + +- Sort (148) + +- Exchange (147) + +- Project (146) + +- Filter (145) + +- Scan parquet (144) (1) Scan parquet @@ -143,552 +153,640 @@ Input [2]: [c_custkey#X, c_nationkey#X] Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [3]: 
[hash_partition_key#X, c_nationkey#X, o_orderkey#X] +Arguments: X + +(23) ColumnarExchange Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [2]: [c_nationkey#X, o_orderkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [2]: [c_nationkey#X, o_orderkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [2]: [c_nationkey#X, o_orderkey#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [5]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [5]: [hash(l_suppkey#X, c_nationkey#X, 42) AS hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(40) ColumnarExchange Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(38) 
InputIteratorTransformer +(43) InputIteratorTransformer Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: [s_suppkey#X, s_nationkey#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, s_nationkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(57) ColumnarExchange Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] -(54) Scan parquet +(61) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(55) NoopFilter +(62) NoopFilter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: [n_nationkey#X, n_name#X, n_regionkey#X] -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [4]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Arguments: false -(58) 
ColumnarExchange +(65) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] +Arguments: X + +(66) ColumnarExchange Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [5]: [hash(n_regionkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] +Arguments: X + +(74) ColumnarExchange Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(70) NoopFilter +(79) NoopFilter Input [2]: [r_regionkey#X, r_name#X] Arguments: [r_regionkey#X, r_name#X] -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [2]: [hash(r_regionkey#X, 42) AS hash_partition_key#X, r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [2]: [hash_partition_key#X, r_regionkey#X] +Arguments: X + +(83) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [1]: [r_regionkey#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [1]: [r_regionkey#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [1]: [r_regionkey#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, 
CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS _pre_X#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(79) FlushableHashAggregateExecTransformer +(89) FlushableHashAggregateExecTransformer Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, _pre_X#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(80) ProjectExecTransformer +(90) ProjectExecTransformer Output [4]: [hash(n_name#X, 42) AS hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Input [3]: [n_name#X, sum#X, isEmpty#X] -(81) WholeStageCodegenTransformer (X) +(91) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: false -(82) ColumnarExchange +(92) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] +Arguments: X + +(93) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(83) ShuffleQueryStage +(94) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] Arguments: X -(84) InputAdapter +(95) InputAdapter Input [3]: [n_name#X, sum#X, isEmpty#X] -(85) InputIteratorTransformer +(96) InputIteratorTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] -(86) RegularHashAggregateExecTransformer +(97) RegularHashAggregateExecTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [2]: [n_name#X, revenue#X] +Arguments: X + +(100) ColumnarExchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [2]: [n_name#X, revenue#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [2]: [n_name#X, revenue#X] -(92) SortExecTransformer +(104) SortExecTransformer Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(93) WholeStageCodegenTransformer (X) +(105) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(94) VeloxColumnarToRowExec +(106) VeloxColumnarToRowExec Input [2]: [n_name#X, revenue#X] -(95) Scan parquet +(107) Scan parquet Output 
[2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(96) Filter +(108) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(97) Exchange +(109) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(98) Scan parquet +(110) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(111) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(99) Filter +(112) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(100) Project +(113) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(101) Exchange +(114) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) ShuffledHashJoin +(115) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(116) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(103) Project +(117) Project Output [2]: [c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(104) Exchange +(118) Exchange Input [2]: [c_nationkey#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) Scan parquet +(119) Sort +Input [2]: [c_nationkey#X, o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(120) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(106) Filter +(121) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(107) Exchange +(122) Exchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) ShuffledHashJoin +(123) Sort +Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(124) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(109) Project +(125) Project Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(110) Exchange +(126) Exchange Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(111) Scan parquet +(127) Sort +Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST, c_nationkey#X ASC NULLS FIRST], false, 0 + +(128) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true 
Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(112) Filter +(129) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(113) Exchange +(130) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(114) ShuffledHashJoin +(131) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST, s_nationkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join condition: None -(115) Project +(133) Project Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(116) Exchange +(134) Exchange Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) Scan parquet +(135) Sort +Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(136) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(118) Filter +(137) Filter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(119) Exchange +(138) Exchange Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(120) ShuffledHashJoin +(139) Sort +Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(140) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(121) Project +(141) Project Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(122) Exchange +(142) Exchange Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(123) Scan parquet +(143) Sort +Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] +Arguments: [n_regionkey#X ASC NULLS FIRST], false, 0 + +(144) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(124) Filter +(145) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(125) Project +(146) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(126) Exchange +(147) Exchange Input [1]: [r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(127) ShuffledHashJoin +(148) Sort +Input [1]: [r_regionkey#X] +Arguments: [r_regionkey#X ASC NULLS FIRST], false, 0 + +(149) SortMergeJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(128) Project +(150) Project Output [3]: [l_extendedprice#X, l_discount#X, n_name#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(129) HashAggregate +(151) HashAggregate 
Input [3]: [l_extendedprice#X, l_discount#X, n_name#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(130) Exchange +(152) Exchange Input [3]: [n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(131) HashAggregate +(153) HashAggregate Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(132) Exchange +(154) Exchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(133) Sort +(155) Sort Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(134) AdaptiveSparkPlan +(156) AdaptiveSparkPlan Output [2]: [n_name#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/6.txt index 8d1a71e9751e2..b6f876d48e5a0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/6.txt @@ -1,23 +1,23 @@ == Physical Plan == -AdaptiveSparkPlan (19) +AdaptiveSparkPlan (20) +- == Final Plan == - VeloxColumnarToRowExec (12) - +- ^ RegularHashAggregateExecTransformer (10) - +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ ShuffleQueryStage (7) - +- ColumnarExchange (6) + VeloxColumnarToRowExec (13) + +- ^ RegularHashAggregateExecTransformer (11) + +- ^ InputIteratorTransformer (10) + +- ShuffleQueryStage (8) + +- ColumnarExchange (7) + +- VeloxAppendBatches (6) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ NoopFilter (2) +- ^ Scan parquet (1) +- == Initial Plan == - HashAggregate (18) - +- Exchange (17) - +- HashAggregate (16) - +- Project (15) - +- Filter (14) - +- Scan parquet (13) + HashAggregate (19) + +- Exchange (18) + +- HashAggregate (17) + +- Project (16) + +- Filter (15) + +- Scan parquet (14) (1) Scan parquet @@ -46,67 +46,71 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(6) ColumnarExchange +(6) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(7) ShuffleQueryStage +(8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(8) InputAdapter +(9) 
InputAdapter Input [2]: [sum#X, isEmpty#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(10) RegularHashAggregateExecTransformer +(11) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X AS revenue#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(12) VeloxColumnarToRowExec +(13) VeloxColumnarToRowExec Input [1]: [revenue#X] -(13) Scan parquet +(14) Scan parquet Output [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), IsNotNull(l_discount), IsNotNull(l_quantity), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), GreaterThanOrEqual(l_discount,0.05), LessThanOrEqual(l_discount,0.07), LessThan(l_quantity,24.00)] ReadSchema: struct -(14) Filter +(15) Filter Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((((((isnotnull(l_shipdate#X) AND isnotnull(l_discount#X)) AND isnotnull(l_quantity#X)) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND (l_discount#X >= 0.05)) AND (l_discount#X <= 0.07)) AND (l_quantity#X < 24.00)) -(15) Project +(16) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) HashAggregate +(17) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(17) Exchange +(18) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(18) HashAggregate +(19) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X AS revenue#X] -(19) AdaptiveSparkPlan +(20) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/7.txt index 5ac66a7e27088..d9eb23cb737e7 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/7.txt @@ -1,122 +1,131 @@ == Physical Plan == -AdaptiveSparkPlan (128) +AdaptiveSparkPlan (149) +- == Final Plan == - VeloxColumnarToRowExec (90) - +- ^ SortExecTransformer (88) - +- ^ InputIteratorTransformer (87) - +- ^ InputAdapter (86) - +- ^ ShuffleQueryStage (85) - +- ColumnarExchange (84) - 
+- ^ RegularHashAggregateExecTransformer (82) - +- ^ InputIteratorTransformer (81) - +- ^ InputAdapter (80) - +- ^ ShuffleQueryStage (79) - +- ColumnarExchange (78) - +- ^ ProjectExecTransformer (76) - +- ^ FlushableHashAggregateExecTransformer (75) - +- ^ ProjectExecTransformer (74) - +- ^ ShuffledHashJoinExecTransformer Inner (73) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (101) + +- ^ SortExecTransformer (99) + +- ^ InputIteratorTransformer (98) + +- ShuffleQueryStage (96) + +- ColumnarExchange (95) + +- VeloxAppendBatches (94) + +- ^ RegularHashAggregateExecTransformer (92) + +- ^ InputIteratorTransformer (91) + +- ShuffleQueryStage (89) + +- ColumnarExchange (88) + +- VeloxAppendBatches (87) + +- ^ ProjectExecTransformer (85) + +- ^ FlushableHashAggregateExecTransformer (84) + +- ^ ProjectExecTransformer (83) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (82) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ NoopFilter (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ NoopFilter (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29) 
- : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ NoopFilter (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ NoopFilter (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ NoopFilter (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (72) - +- ^ InputAdapter (71) - +- ^ ShuffleQueryStage (70) - +- ReusedExchange (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ NoopFilter (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ NoopFilter (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ NoopFilter (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ NoopFilter (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (81) + +- ShuffleQueryStage (79) + +- ReusedExchange (78) +- == Initial Plan == - Sort (127) - +- Exchange (126) - +- HashAggregate (125) - +- Exchange (124) - +- HashAggregate (123) - +- Project (122) - +- ShuffledHashJoin Inner BuildRight (121) - :- Exchange (117) - : +- Project (116) - : +- ShuffledHashJoin Inner BuildRight (115) - : :- Exchange (111) - : : +- Project (110) - : : +- ShuffledHashJoin Inner BuildRight (109) - : : :- Exchange (105) - : : : +- Project (104) - : : : +- ShuffledHashJoin Inner BuildRight (103) - : : : :- Exchange (99) - : : : : +- Project (98) - : : : : +- ShuffledHashJoin Inner BuildLeft (97) - : : : : :- Exchange (93) - : : : : : +- Filter (92) - : : : : : +- Scan parquet (91) - : : : : +- Exchange (96) - : : : : +- Filter (95) - : : : : +- Scan parquet (94) - : : : +- Exchange (102) - : : : +- Filter (101) - : : : +- Scan parquet (100) - : : +- Exchange (108) - : : +- Filter (107) - : : +- Scan parquet (106) - : +- Exchange (114) - : +- Filter (113) - : +- Scan parquet (112) - +- Exchange (120) - +- Filter (119) - +- Scan parquet (118) + Sort (148) + +- Exchange (147) + +- HashAggregate (146) + +- Exchange (145) + +- HashAggregate (144) + +- Project (143) + +- SortMergeJoin Inner (142) + :- Sort (137) + : +- Exchange (136) + : +- Project (135) + : +- SortMergeJoin Inner (134) + : :- Sort (129) + : : +- Exchange (128) + : : +- Project (127) + : : +- SortMergeJoin Inner (126) + : : :- Sort (121) + : : : +- Exchange (120) + : : : +- Project (119) + : : : +- SortMergeJoin Inner (118) + : : : :- Sort (113) + : : : : +- Exchange (112) + : : : : +- Project (111) + : : : : +- SortMergeJoin Inner (110) + : : : : :- Sort (105) + : : : : : +- Exchange (104) + : : : : : +- Filter (103) + : : : : : +- Scan parquet (102) + : : : : +- Sort (109) + : : : : +- Exchange (108) + : : : : +- Filter (107) + : : : : +- Scan 
parquet (106) + : : : +- Sort (117) + : : : +- Exchange (116) + : : : +- Filter (115) + : : : +- Scan parquet (114) + : : +- Sort (125) + : : +- Exchange (124) + : : +- Filter (123) + : : +- Scan parquet (122) + : +- Sort (133) + : +- Exchange (132) + : +- Filter (131) + : +- Scan parquet (130) + +- Sort (141) + +- Exchange (140) + +- Filter (139) + +- Scan parquet (138) (1) Scan parquet @@ -138,524 +147,608 @@ Input [2]: [s_suppkey#X, s_nationkey#X] Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [6]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: X + +(15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [6]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(19) 
WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: X + +(23) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [2]: [o_orderkey#X, o_custkey#X] Arguments: [o_orderkey#X, o_custkey#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [2]: [o_orderkey#X, o_custkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [6]: [hash(o_custkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] +Arguments: X + +(40) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage 
Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [2]: [c_custkey#X, c_nationkey#X] Arguments: [c_custkey#X, c_nationkey#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(c_custkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_nationkey#X] Input [2]: [c_custkey#X, c_nationkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [6]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] +Arguments: X + +(57) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(55) NoopFilter +(62) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: 
[n_nationkey#X, n_name#X] -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [6]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] +Arguments: X + +(74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -(69) ReusedExchange [Reuses operator id: 58] +(78) ReusedExchange [Reuses operator id: 66] Output [2]: [n_nationkey#X, n_name#X] -(70) ShuffleQueryStage +(79) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(71) InputAdapter +(80) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(72) InputIteratorTransformer +(81) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(73) ShuffledHashJoinExecTransformer +(82) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(74) ProjectExecTransformer +(83) ProjectExecTransformer Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS volume#X] 
Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(75) FlushableHashAggregateExecTransformer +(84) FlushableHashAggregateExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(76) ProjectExecTransformer +(85) ProjectExecTransformer Output [6]: [hash(supp_nation#X, cust_nation#X, l_year#X, 42) AS hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(77) WholeStageCodegenTransformer (X) +(86) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: false -(78) ColumnarExchange +(87) VeloxAppendBatches +Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] +Arguments: X + +(88) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(79) ShuffleQueryStage +(89) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: X -(80) InputAdapter +(90) InputAdapter Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(81) InputIteratorTransformer +(91) InputIteratorTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(82) RegularHashAggregateExecTransformer +(92) RegularHashAggregateExecTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(83) WholeStageCodegenTransformer (X) +(93) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(84) ColumnarExchange +(94) VeloxAppendBatches +Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] +Arguments: X + +(95) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(85) ShuffleQueryStage +(96) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: X -(86) InputAdapter +(97) InputAdapter Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(87) InputIteratorTransformer +(98) InputIteratorTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(88) SortExecTransformer +(99) SortExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(89) WholeStageCodegenTransformer (X) +(100) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(90) VeloxColumnarToRowExec +(101) VeloxColumnarToRowExec Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(91) Scan parquet +(102) Scan parquet Output [2]: 
[s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(92) Filter +(103) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(93) Exchange +(104) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(94) Scan parquet +(105) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(106) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(95) Filter +(107) Filter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(96) Exchange +(108) Exchange Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) ShuffledHashJoin +(109) Sort +Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(110) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(98) Project +(111) Project Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(99) Exchange +(112) Exchange Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(100) Scan parquet +(113) Sort +Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(114) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(101) Filter +(115) Filter Input [2]: [o_orderkey#X, o_custkey#X] Condition : (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(102) Exchange +(116) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(103) ShuffledHashJoin +(117) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(118) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(104) Project +(119) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(105) Exchange +(120) Exchange Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) Scan parquet +(121) Sort +Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], 
false, 0 + +(122) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(107) Filter +(123) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(108) Exchange +(124) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(125) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(126) SortMergeJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(110) Project +(127) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(111) Exchange +(128) Exchange Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) Scan parquet +(129) Sort +Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(130) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(113) Filter +(131) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(114) Exchange +(132) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(115) ShuffledHashJoin +(133) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(134) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(116) Project +(135) Project Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(117) Exchange +(136) Exchange Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(118) Scan parquet +(137) Sort +Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(138) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,GERMANY),EqualTo(n_name,FRANCE))] ReadSchema: struct -(119) Filter +(139) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = GERMANY) OR (n_name#X = FRANCE))) -(120) Exchange +(140) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(121) ShuffledHashJoin +(141) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(142) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(122) Project +(143) 
Project Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(123) HashAggregate +(144) HashAggregate Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(124) Exchange +(145) Exchange Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(125) HashAggregate +(146) HashAggregate Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(126) Exchange +(147) Exchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(127) Sort +(148) Sort Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(128) AdaptiveSparkPlan +(149) AdaptiveSparkPlan Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/8.txt index 7e7064f9fbae8..5c9e51b95c606 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/8.txt @@ -1,166 +1,180 @@ == Physical Plan == -AdaptiveSparkPlan (177) +AdaptiveSparkPlan (207) +- == Final Plan == - VeloxColumnarToRowExec (125) - +- ^ SortExecTransformer (123) - +- ^ InputIteratorTransformer (122) - +- ^ InputAdapter (121) - +- ^ ShuffleQueryStage (120) - +- ColumnarExchange (119) - +- ^ ProjectExecTransformer (117) - +- ^ RegularHashAggregateExecTransformer (116) - +- ^ InputIteratorTransformer (115) - +- ^ InputAdapter (114) - +- ^ ShuffleQueryStage (113) - +- ColumnarExchange (112) - +- ^ ProjectExecTransformer (110) - +- ^ FlushableHashAggregateExecTransformer (109) - +- ^ ProjectExecTransformer (108) - +- ^ ShuffledHashJoinExecTransformer Inner (107) - :- ^ InputIteratorTransformer (98) - : +- ^ InputAdapter (97) - : +- ^ ShuffleQueryStage (96) - : +- ColumnarExchange (95) - : +- ^ ProjectExecTransformer (93) - : +- ^ ShuffledHashJoinExecTransformer Inner (92) - : :- ^ InputIteratorTransformer (83) - : : +- ^ InputAdapter (82) - : : +- ^ ShuffleQueryStage (81) - : : +- ColumnarExchange (80) - : : +- ^ ProjectExecTransformer (78) - : : +- ^ ShuffledHashJoinExecTransformer Inner (77) - : : :- ^ InputIteratorTransformer (68) - : : : +- ^ InputAdapter (67) - : : : +- ^ ShuffleQueryStage (66) - : : : +- ColumnarExchange (65) - : : : +- ^ 
ProjectExecTransformer (63) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : : : :- ^ InputIteratorTransformer (53) - : : : : +- ^ InputAdapter (52) - : : : : +- ^ ShuffleQueryStage (51) - : : : : +- ColumnarExchange (50) - : : : : +- ^ ProjectExecTransformer (48) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : : : :- ^ InputIteratorTransformer (38) - : : : : : +- ^ InputAdapter (37) - : : : : : +- ^ ShuffleQueryStage (36) - : : : : : +- ColumnarExchange (35) - : : : : : +- ^ ProjectExecTransformer (33) - : : : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : : : :- ^ InputIteratorTransformer (23) - : : : : : : +- ^ InputAdapter (22) - : : : : : : +- ^ ShuffleQueryStage (21) - : : : : : : +- ColumnarExchange (20) - : : : : : : +- ^ ProjectExecTransformer (18) - : : : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : : : :- ^ InputIteratorTransformer (8) - : : : : : : : +- ^ InputAdapter (7) - : : : : : : : +- ^ ShuffleQueryStage (6) - : : : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (141) + +- ^ SortExecTransformer (139) + +- ^ InputIteratorTransformer (138) + +- ShuffleQueryStage (136) + +- ColumnarExchange (135) + +- VeloxAppendBatches (134) + +- ^ ProjectExecTransformer (132) + +- ^ RegularHashAggregateExecTransformer (131) + +- ^ InputIteratorTransformer (130) + +- ShuffleQueryStage (128) + +- ColumnarExchange (127) + +- VeloxAppendBatches (126) + +- ^ ProjectExecTransformer (124) + +- ^ FlushableHashAggregateExecTransformer (123) + +- ^ ProjectExecTransformer (122) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (121) + :- ^ InputIteratorTransformer (111) + : +- ShuffleQueryStage (109) + : +- ColumnarExchange (108) + : +- VeloxAppendBatches (107) + : +- ^ ProjectExecTransformer (105) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (104) + : :- ^ InputIteratorTransformer (94) + : : +- ShuffleQueryStage (92) + : : +- ColumnarExchange (91) + : : +- VeloxAppendBatches (90) + : : +- ^ ProjectExecTransformer (88) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + : : :- ^ InputIteratorTransformer (77) + : : : +- ShuffleQueryStage (75) + : : : +- ColumnarExchange (74) + : : : +- VeloxAppendBatches (73) + : : : +- ^ ProjectExecTransformer (71) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (70) + : : : :- ^ InputIteratorTransformer (60) + : : : : +- ShuffleQueryStage (58) + : : : : +- ColumnarExchange (57) + : : : : +- VeloxAppendBatches (56) + : : : : +- ^ ProjectExecTransformer (54) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (53) + : : : : :- ^ InputIteratorTransformer (43) + : : : : : +- ShuffleQueryStage (41) + : : : : : +- ColumnarExchange (40) + : : : : : +- VeloxAppendBatches (39) + : : : : : +- ^ ProjectExecTransformer (37) + : : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : : : :- ^ InputIteratorTransformer (26) + : : : : : : +- ShuffleQueryStage (24) + : : : : : : +- ColumnarExchange (23) + : : : : : : +- VeloxAppendBatches (22) + : : : : : : +- ^ ProjectExecTransformer (20) + : : : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : : : :- ^ InputIteratorTransformer (9) + : : : : : : : +- ShuffleQueryStage (7) + : : : : : : : +- ColumnarExchange (6) + : : : : : : : +- VeloxAppendBatches (5) : : : : : : : +- ^ ProjectExecTransformer (3) : : : : : : : +- ^ NoopFilter (2) : : : : : : : +- ^ Scan parquet (1) - : : : : : : +- ^ InputIteratorTransformer (16) - : : : : : : +- ^ InputAdapter (15) - : : : : : : 
+- ^ ShuffleQueryStage (14) - : : : : : : +- ColumnarExchange (13) - : : : : : : +- ^ ProjectExecTransformer (11) - : : : : : : +- ^ NoopFilter (10) - : : : : : : +- ^ Scan parquet (9) - : : : : : +- ^ InputIteratorTransformer (31) - : : : : : +- ^ InputAdapter (30) - : : : : : +- ^ ShuffleQueryStage (29) - : : : : : +- ColumnarExchange (28) - : : : : : +- ^ ProjectExecTransformer (26) - : : : : : +- ^ NoopFilter (25) - : : : : : +- ^ Scan parquet (24) - : : : : +- ^ InputIteratorTransformer (46) - : : : : +- ^ InputAdapter (45) - : : : : +- ^ ShuffleQueryStage (44) - : : : : +- ColumnarExchange (43) - : : : : +- ^ ProjectExecTransformer (41) - : : : : +- ^ NoopFilter (40) - : : : : +- ^ Scan parquet (39) - : : : +- ^ InputIteratorTransformer (61) - : : : +- ^ InputAdapter (60) - : : : +- ^ ShuffleQueryStage (59) - : : : +- ColumnarExchange (58) - : : : +- ^ ProjectExecTransformer (56) - : : : +- ^ NoopFilter (55) - : : : +- ^ Scan parquet (54) - : : +- ^ InputIteratorTransformer (76) - : : +- ^ InputAdapter (75) - : : +- ^ ShuffleQueryStage (74) - : : +- ColumnarExchange (73) - : : +- ^ ProjectExecTransformer (71) - : : +- ^ NoopFilter (70) - : : +- ^ Scan parquet (69) - : +- ^ InputIteratorTransformer (91) - : +- ^ InputAdapter (90) - : +- ^ ShuffleQueryStage (89) - : +- ColumnarExchange (88) - : +- ^ ProjectExecTransformer (86) - : +- ^ NoopFilter (85) - : +- ^ Scan parquet (84) - +- ^ InputIteratorTransformer (106) - +- ^ InputAdapter (105) - +- ^ ShuffleQueryStage (104) - +- ColumnarExchange (103) - +- ^ ProjectExecTransformer (101) - +- ^ NoopFilter (100) - +- ^ Scan parquet (99) + : : : : : : +- ^ InputIteratorTransformer (18) + : : : : : : +- ShuffleQueryStage (16) + : : : : : : +- ColumnarExchange (15) + : : : : : : +- VeloxAppendBatches (14) + : : : : : : +- ^ ProjectExecTransformer (12) + : : : : : : +- ^ NoopFilter (11) + : : : : : : +- ^ Scan parquet (10) + : : : : : +- ^ InputIteratorTransformer (35) + : : : : : +- ShuffleQueryStage (33) + : : : : : +- ColumnarExchange (32) + : : : : : +- VeloxAppendBatches (31) + : : : : : +- ^ ProjectExecTransformer (29) + : : : : : +- ^ NoopFilter (28) + : : : : : +- ^ Scan parquet (27) + : : : : +- ^ InputIteratorTransformer (52) + : : : : +- ShuffleQueryStage (50) + : : : : +- ColumnarExchange (49) + : : : : +- VeloxAppendBatches (48) + : : : : +- ^ ProjectExecTransformer (46) + : : : : +- ^ NoopFilter (45) + : : : : +- ^ Scan parquet (44) + : : : +- ^ InputIteratorTransformer (69) + : : : +- ShuffleQueryStage (67) + : : : +- ColumnarExchange (66) + : : : +- VeloxAppendBatches (65) + : : : +- ^ ProjectExecTransformer (63) + : : : +- ^ NoopFilter (62) + : : : +- ^ Scan parquet (61) + : : +- ^ InputIteratorTransformer (86) + : : +- ShuffleQueryStage (84) + : : +- ColumnarExchange (83) + : : +- VeloxAppendBatches (82) + : : +- ^ ProjectExecTransformer (80) + : : +- ^ NoopFilter (79) + : : +- ^ Scan parquet (78) + : +- ^ InputIteratorTransformer (103) + : +- ShuffleQueryStage (101) + : +- ColumnarExchange (100) + : +- VeloxAppendBatches (99) + : +- ^ ProjectExecTransformer (97) + : +- ^ NoopFilter (96) + : +- ^ Scan parquet (95) + +- ^ InputIteratorTransformer (120) + +- ShuffleQueryStage (118) + +- ColumnarExchange (117) + +- VeloxAppendBatches (116) + +- ^ ProjectExecTransformer (114) + +- ^ NoopFilter (113) + +- ^ Scan parquet (112) +- == Initial Plan == - Sort (176) - +- Exchange (175) - +- HashAggregate (174) - +- Exchange (173) - +- HashAggregate (172) - +- Project (171) - +- ShuffledHashJoin Inner BuildRight (170) - :- Exchange (165) 
- : +- Project (164) - : +- ShuffledHashJoin Inner BuildRight (163) - : :- Exchange (159) - : : +- Project (158) - : : +- ShuffledHashJoin Inner BuildRight (157) - : : :- Exchange (153) - : : : +- Project (152) - : : : +- ShuffledHashJoin Inner BuildRight (151) - : : : :- Exchange (147) - : : : : +- Project (146) - : : : : +- ShuffledHashJoin Inner BuildRight (145) - : : : : :- Exchange (141) - : : : : : +- Project (140) - : : : : : +- ShuffledHashJoin Inner BuildRight (139) - : : : : : :- Exchange (135) - : : : : : : +- Project (134) - : : : : : : +- ShuffledHashJoin Inner BuildLeft (133) - : : : : : : :- Exchange (129) - : : : : : : : +- Project (128) - : : : : : : : +- Filter (127) - : : : : : : : +- Scan parquet (126) - : : : : : : +- Exchange (132) - : : : : : : +- Filter (131) - : : : : : : +- Scan parquet (130) - : : : : : +- Exchange (138) - : : : : : +- Filter (137) - : : : : : +- Scan parquet (136) - : : : : +- Exchange (144) - : : : : +- Filter (143) - : : : : +- Scan parquet (142) - : : : +- Exchange (150) - : : : +- Filter (149) - : : : +- Scan parquet (148) - : : +- Exchange (156) - : : +- Filter (155) - : : +- Scan parquet (154) - : +- Exchange (162) - : +- Filter (161) - : +- Scan parquet (160) - +- Exchange (169) - +- Project (168) - +- Filter (167) - +- Scan parquet (166) + Sort (206) + +- Exchange (205) + +- HashAggregate (204) + +- Exchange (203) + +- HashAggregate (202) + +- Project (201) + +- SortMergeJoin Inner (200) + :- Sort (194) + : +- Exchange (193) + : +- Project (192) + : +- SortMergeJoin Inner (191) + : :- Sort (186) + : : +- Exchange (185) + : : +- Project (184) + : : +- SortMergeJoin Inner (183) + : : :- Sort (178) + : : : +- Exchange (177) + : : : +- Project (176) + : : : +- SortMergeJoin Inner (175) + : : : :- Sort (170) + : : : : +- Exchange (169) + : : : : +- Project (168) + : : : : +- SortMergeJoin Inner (167) + : : : : :- Sort (162) + : : : : : +- Exchange (161) + : : : : : +- Project (160) + : : : : : +- SortMergeJoin Inner (159) + : : : : : :- Sort (154) + : : : : : : +- Exchange (153) + : : : : : : +- Project (152) + : : : : : : +- SortMergeJoin Inner (151) + : : : : : : :- Sort (146) + : : : : : : : +- Exchange (145) + : : : : : : : +- Project (144) + : : : : : : : +- Filter (143) + : : : : : : : +- Scan parquet (142) + : : : : : : +- Sort (150) + : : : : : : +- Exchange (149) + : : : : : : +- Filter (148) + : : : : : : +- Scan parquet (147) + : : : : : +- Sort (158) + : : : : : +- Exchange (157) + : : : : : +- Filter (156) + : : : : : +- Scan parquet (155) + : : : : +- Sort (166) + : : : : +- Exchange (165) + : : : : +- Filter (164) + : : : : +- Scan parquet (163) + : : : +- Sort (174) + : : : +- Exchange (173) + : : : +- Filter (172) + : : : +- Scan parquet (171) + : : +- Sort (182) + : : +- Exchange (181) + : : +- Filter (180) + : : +- Scan parquet (179) + : +- Sort (190) + : +- Exchange (189) + : +- Filter (188) + : +- Scan parquet (187) + +- Sort (199) + +- Exchange (198) + +- Project (197) + +- Filter (196) + +- Scan parquet (195) (1) Scan parquet @@ -182,732 +196,852 @@ Input [2]: [p_partkey#X, p_type#X] Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(7) 
InputAdapter +(8) InputAdapter Input [1]: [p_partkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [p_partkey#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [6]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(23) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: 
[IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: [s_suppkey#X, s_nationkey#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [5]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(40) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(39) Scan parquet +(44) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, 
o_orderdate#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [6]: [hash(o_custkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] +Arguments: X + +(57) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(55) NoopFilter +(62) NoopFilter Input [2]: [c_custkey#X, c_nationkey#X] Arguments: [c_custkey#X, c_nationkey#X] -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(c_custkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_nationkey#X] Input [2]: [c_custkey#X, c_nationkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [o_custkey#X] Right keys [1]: 
[c_custkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [6]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] +Arguments: X + +(74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(70) NoopFilter +(79) NoopFilter Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: [n_nationkey#X, n_regionkey#X] -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_regionkey#X] Input [2]: [n_nationkey#X, n_regionkey#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] +Arguments: X + +(83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_regionkey#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [2]: [n_nationkey#X, n_regionkey#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [2]: [n_nationkey#X, n_regionkey#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [6]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(79) WholeStageCodegenTransformer (X) +(89) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: false -(80) ColumnarExchange +(90) VeloxAppendBatches +Input [6]: [hash_partition_key#X, 
l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] +Arguments: X + +(91) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [id=#X] -(81) ShuffleQueryStage +(92) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: X -(82) InputAdapter +(93) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -(83) InputIteratorTransformer +(94) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -(84) Scan parquet +(95) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(85) NoopFilter +(96) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(86) ProjectExecTransformer +(97) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(100) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(92) ShuffledHashJoinExecTransformer +(104) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(93) ProjectExecTransformer +(105) ProjectExecTransformer Output [6]: [hash(n_regionkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(94) WholeStageCodegenTransformer (X) +(106) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: false -(95) ColumnarExchange +(107) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] +Arguments: X + +(108) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [id=#X] -(96) ShuffleQueryStage +(109) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: X -(97) InputAdapter +(110) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -(98) InputIteratorTransformer +(111) InputIteratorTransformer Input [5]: 
[l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -(99) Scan parquet +(112) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(100) NoopFilter +(113) NoopFilter Input [2]: [r_regionkey#X, r_name#X] Arguments: [r_regionkey#X, r_name#X] -(101) ProjectExecTransformer +(114) ProjectExecTransformer Output [2]: [hash(r_regionkey#X, 42) AS hash_partition_key#X, r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(102) WholeStageCodegenTransformer (X) +(115) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: false -(103) ColumnarExchange +(116) VeloxAppendBatches +Input [2]: [hash_partition_key#X, r_regionkey#X] +Arguments: X + +(117) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] -(104) ShuffleQueryStage +(118) ShuffleQueryStage Output [1]: [r_regionkey#X] Arguments: X -(105) InputAdapter +(119) InputAdapter Input [1]: [r_regionkey#X] -(106) InputIteratorTransformer +(120) InputIteratorTransformer Input [1]: [r_regionkey#X] -(107) ShuffledHashJoinExecTransformer +(121) ShuffledHashJoinExecTransformer Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(108) ProjectExecTransformer +(122) ProjectExecTransformer Output [4]: [year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS volume#X, n_name#X AS nation#X, CASE WHEN (n_name#X = BRAZIL) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END AS _pre_X#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(109) FlushableHashAggregateExecTransformer +(123) FlushableHashAggregateExecTransformer Input [4]: [o_year#X, volume#X, nation#X, _pre_X#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(_pre_X#X), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(110) ProjectExecTransformer +(124) ProjectExecTransformer Output [6]: [hash(o_year#X, 42) AS hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(111) WholeStageCodegenTransformer (X) +(125) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(112) ColumnarExchange +(126) VeloxAppendBatches +Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(127) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(113) ShuffleQueryStage +(128) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(114) InputAdapter +(129) InputAdapter Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(115) 
InputIteratorTransformer +(130) InputIteratorTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(116) RegularHashAggregateExecTransformer +(131) RegularHashAggregateExecTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(117) ProjectExecTransformer +(132) ProjectExecTransformer Output [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6), true) AS mkt_share#X] Input [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(118) WholeStageCodegenTransformer (X) +(133) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(119) ColumnarExchange +(134) VeloxAppendBatches +Input [2]: [o_year#X, mkt_share#X] +Arguments: X + +(135) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(120) ShuffleQueryStage +(136) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] Arguments: X -(121) InputAdapter +(137) InputAdapter Input [2]: [o_year#X, mkt_share#X] -(122) InputIteratorTransformer +(138) InputIteratorTransformer Input [2]: [o_year#X, mkt_share#X] -(123) SortExecTransformer +(139) SortExecTransformer Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(124) WholeStageCodegenTransformer (X) +(140) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(125) VeloxColumnarToRowExec +(141) VeloxColumnarToRowExec Input [2]: [o_year#X, mkt_share#X] -(126) Scan parquet +(142) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_type), EqualTo(p_type,ECONOMY ANODIZED STEEL), IsNotNull(p_partkey)] ReadSchema: struct -(127) Filter +(143) Filter Input [2]: [p_partkey#X, p_type#X] Condition : ((isnotnull(p_type#X) AND (p_type#X = ECONOMY ANODIZED STEEL)) AND isnotnull(p_partkey#X)) -(128) Project +(144) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_type#X] -(129) Exchange +(145) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(130) Scan parquet +(146) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(147) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(131) Filter +(148) Filter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(132) Exchange +(149) Exchange Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(133) ShuffledHashJoin +(150) Sort +Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS 
FIRST], false, 0 + +(151) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(134) Project +(152) Project Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(135) Exchange +(153) Exchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(136) Scan parquet +(154) Sort +Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(155) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(137) Filter +(156) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(138) Exchange +(157) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(139) ShuffledHashJoin +(158) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(159) SortMergeJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(140) Project +(160) Project Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(141) Exchange +(161) Exchange Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(142) Scan parquet +(162) Sort +Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(163) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(143) Filter +(164) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(144) Exchange +(165) Exchange Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(145) ShuffledHashJoin +(166) Sort +Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(167) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(146) Project +(168) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(147) Exchange +(169) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(148) Scan parquet +(170) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(171) Scan parquet Output [2]: [c_custkey#X, 
c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(149) Filter +(172) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(150) Exchange +(173) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(151) ShuffledHashJoin +(174) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(175) SortMergeJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(152) Project +(176) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(153) Exchange +(177) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(154) Scan parquet +(178) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(179) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(155) Filter +(180) Filter Input [2]: [n_nationkey#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(156) Exchange +(181) Exchange Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(157) ShuffledHashJoin +(182) Sort +Input [2]: [n_nationkey#X, n_regionkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(183) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(158) Project +(184) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(159) Exchange +(185) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(160) Scan parquet +(186) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(187) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(161) Filter +(188) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(162) Exchange +(189) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(163) ShuffledHashJoin +(190) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(191) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(164) Project +(192) Project Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(165) Exchange +(193) Exchange Input [5]: 
[l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(166) Scan parquet +(194) Sort +Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] +Arguments: [n_regionkey#X ASC NULLS FIRST], false, 0 + +(195) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(167) Filter +(196) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(168) Project +(197) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(169) Exchange +(198) Exchange Input [1]: [r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(170) ShuffledHashJoin +(199) Sort +Input [1]: [r_regionkey#X] +Arguments: [r_regionkey#X ASC NULLS FIRST], false, 0 + +(200) SortMergeJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(171) Project +(201) Project Output [3]: [year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS volume#X, n_name#X AS nation#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(172) HashAggregate +(202) HashAggregate Input [3]: [o_year#X, volume#X, nation#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(173) Exchange +(203) Exchange Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(174) HashAggregate +(204) HashAggregate Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6), true) AS mkt_share#X] -(175) Exchange +(205) Exchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(176) Sort +(206) Sort Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(177) AdaptiveSparkPlan +(207) AdaptiveSparkPlan Output [2]: [o_year#X, mkt_share#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/9.txt index 5ba4a9f7ce0ef..2abb6ec215c6c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/9.txt @@ -1,126 +1,136 @@ == Physical Plan == -AdaptiveSparkPlan (133) +AdaptiveSparkPlan (155) +- == Final Plan == - VeloxColumnarToRowExec (94) - +- ^ 
SortExecTransformer (92) - +- ^ InputIteratorTransformer (91) - +- ^ InputAdapter (90) - +- ^ ShuffleQueryStage (89) - +- ColumnarExchange (88) - +- ^ RegularHashAggregateExecTransformer (86) - +- ^ InputIteratorTransformer (85) - +- ^ InputAdapter (84) - +- ^ ShuffleQueryStage (83) - +- ColumnarExchange (82) - +- ^ ProjectExecTransformer (80) - +- ^ FlushableHashAggregateExecTransformer (79) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (106) + +- ^ SortExecTransformer (104) + +- ^ InputIteratorTransformer (103) + +- ShuffleQueryStage (101) + +- ColumnarExchange (100) + +- VeloxAppendBatches (99) + +- ^ RegularHashAggregateExecTransformer (97) + +- ^ InputIteratorTransformer (96) + +- ShuffleQueryStage (94) + +- ColumnarExchange (93) + +- VeloxAppendBatches (92) + +- ^ ProjectExecTransformer (90) + +- ^ FlushableHashAggregateExecTransformer (89) + +- ^ ProjectExecTransformer (88) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ NoopFilter (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ 
NoopFilter (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ NoopFilter (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ NoopFilter (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ NoopFilter (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ NoopFilter (70) - +- ^ Scan parquet (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ NoopFilter (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ NoopFilter (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ NoopFilter (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ NoopFilter (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (86) + +- ShuffleQueryStage (84) + +- ColumnarExchange (83) + +- VeloxAppendBatches (82) + +- ^ ProjectExecTransformer (80) + +- ^ NoopFilter (79) + +- ^ Scan parquet (78) +- == Initial Plan == - Sort (132) - +- Exchange (131) - +- HashAggregate (130) - +- Exchange (129) - +- HashAggregate (128) - +- Project (127) - +- ShuffledHashJoin Inner BuildRight (126) - :- Exchange (122) - : +- Project (121) - : +- ShuffledHashJoin Inner BuildRight (120) - : :- Exchange (116) - : : +- Project (115) - : : +- ShuffledHashJoin Inner BuildRight (114) - : : :- Exchange (110) - : : : +- Project (109) - : : : +- ShuffledHashJoin Inner BuildRight (108) - : : : :- Exchange (104) - : : : : +- Project (103) - : : : : +- ShuffledHashJoin Inner BuildLeft (102) - : : : : :- Exchange (98) - : : : : : +- Project (97) - : : : : : +- Filter (96) - : : : : : +- Scan parquet (95) - : : : : +- Exchange (101) - : : : : +- Filter (100) - : : : : +- Scan parquet (99) - : : : +- Exchange (107) - : : : +- Filter (106) - : : : +- Scan parquet (105) - : : +- Exchange (113) - : : +- Filter (112) - : : +- Scan parquet (111) - : +- Exchange (119) - : +- Filter (118) - : +- Scan parquet (117) - +- Exchange (125) - +- Filter (124) - +- Scan parquet (123) + Sort (154) + +- Exchange (153) + +- HashAggregate (152) + +- Exchange (151) + +- HashAggregate (150) + +- Project (149) + +- SortMergeJoin Inner (148) + :- Sort (143) + : +- Exchange (142) + : +- Project (141) + : +- SortMergeJoin Inner (140) + : :- Sort (135) + : : +- Exchange (134) + : : +- Project (133) + : : +- SortMergeJoin Inner (132) + : : :- Sort (127) + : : : +- Exchange (126) + : 
: : +- Project (125) + : : : +- SortMergeJoin Inner (124) + : : : :- Sort (119) + : : : : +- Exchange (118) + : : : : +- Project (117) + : : : : +- SortMergeJoin Inner (116) + : : : : :- Sort (111) + : : : : : +- Exchange (110) + : : : : : +- Project (109) + : : : : : +- Filter (108) + : : : : : +- Scan parquet (107) + : : : : +- Sort (115) + : : : : +- Exchange (114) + : : : : +- Filter (113) + : : : : +- Scan parquet (112) + : : : +- Sort (123) + : : : +- Exchange (122) + : : : +- Filter (121) + : : : +- Scan parquet (120) + : : +- Sort (131) + : : +- Exchange (130) + : : +- Filter (129) + : : +- Scan parquet (128) + : +- Sort (139) + : +- Exchange (138) + : +- Filter (137) + : +- Scan parquet (136) + +- Sort (147) + +- Exchange (146) + +- Filter (145) + +- Scan parquet (144) (1) Scan parquet @@ -142,548 +152,636 @@ Input [2]: [p_partkey#X, p_name#X] Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [p_partkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [p_partkey#X] -(9) Scan parquet +(10) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [7]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(15) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right 
keys [1]: [l_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [7]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(23) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: [s_suppkey#X, s_nationkey#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [8]: [hash(l_suppkey#X, l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] 
Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(40) ColumnarExchange Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(39) Scan parquet +(44) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(ps_suppkey#X, ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [7]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] 
+Arguments: X + +(57) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(55) NoopFilter +(62) NoopFilter Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: [o_orderkey#X, o_orderdate#X] -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_orderdate#X] Input [2]: [o_orderkey#X, o_orderdate#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderdate#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [o_orderkey#X, o_orderdate#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderdate#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [7]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] +Arguments: X + +(74) ColumnarExchange Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: X -(67) InputAdapter +(76) InputAdapter 
Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(70) NoopFilter +(79) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) as decimal(27,4))) - promote_precision(cast(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(l_quantity#X)), DecimalType(25,4), true) as decimal(27,4)))), DecimalType(27,4), true) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(79) FlushableHashAggregateExecTransformer +(89) FlushableHashAggregateExecTransformer Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(80) ProjectExecTransformer +(90) ProjectExecTransformer Output [5]: [hash(nation#X, o_year#X, 42) AS hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(81) WholeStageCodegenTransformer (X) +(91) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: false -(82) ColumnarExchange +(92) VeloxAppendBatches +Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] +Arguments: X + +(93) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(83) ShuffleQueryStage +(94) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: X -(84) 
InputAdapter +(95) InputAdapter Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(85) InputIteratorTransformer +(96) InputIteratorTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(86) RegularHashAggregateExecTransformer +(97) RegularHashAggregateExecTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [3]: [nation#X, o_year#X, sum_profit#X] +Arguments: X + +(100) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [3]: [nation#X, o_year#X, sum_profit#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] -(92) SortExecTransformer +(104) SortExecTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(93) WholeStageCodegenTransformer (X) +(105) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(94) VeloxColumnarToRowExec +(106) VeloxColumnarToRowExec Input [3]: [nation#X, o_year#X, sum_profit#X] -(95) Scan parquet +(107) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringContains(p_name,green), IsNotNull(p_partkey)] ReadSchema: struct -(96) Filter +(108) Filter Input [2]: [p_partkey#X, p_name#X] Condition : ((isnotnull(p_name#X) AND Contains(p_name#X, green)) AND isnotnull(p_partkey#X)) -(97) Project +(109) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(98) Exchange +(110) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Scan parquet +(111) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(112) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(100) Filter +(113) Filter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(101) Exchange +(114) Exchange Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) ShuffledHashJoin +(115) Sort +Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(116) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(103) Project +(117) Project Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, 
l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(104) Exchange +(118) Exchange Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) Scan parquet +(119) Sort +Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(120) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(106) Filter +(121) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(107) Exchange +(122) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) ShuffledHashJoin +(123) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(124) SortMergeJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(109) Project +(125) Project Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(110) Exchange +(126) Exchange Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(111) Scan parquet +(127) Sort +Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [l_suppkey#X ASC NULLS FIRST, l_partkey#X ASC NULLS FIRST], false, 0 + +(128) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(112) Filter +(129) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Condition : (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(113) Exchange +(130) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(114) ShuffledHashJoin +(131) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST, ps_partkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join condition: None -(115) Project +(133) Project Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(116) Exchange +(134) Exchange Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) Scan parquet +(135) Sort +Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(136) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: 
InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(118) Filter +(137) Filter Input [2]: [o_orderkey#X, o_orderdate#X] Condition : isnotnull(o_orderkey#X) -(119) Exchange +(138) Exchange Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(120) ShuffledHashJoin +(139) Sort +Input [2]: [o_orderkey#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(140) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(121) Project +(141) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(122) Exchange +(142) Exchange Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(123) Scan parquet +(143) Sort +Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(144) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(124) Filter +(145) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(125) Exchange +(146) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(126) ShuffledHashJoin +(147) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(148) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(127) Project +(149) Project Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) as decimal(27,4))) - promote_precision(cast(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(l_quantity#X)), DecimalType(25,4), true) as decimal(27,4)))), DecimalType(27,4), true) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(128) HashAggregate +(150) HashAggregate Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(129) Exchange +(151) Exchange Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(130) HashAggregate +(152) HashAggregate Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(131) Exchange +(153) Exchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(132) Sort +(154) Sort Input [3]: [nation#X, o_year#X, 
sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(133) AdaptiveSparkPlan +(155) AdaptiveSparkPlan Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/1.txt index 74f79bd3ee642..afea15af53d24 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/1.txt @@ -1,31 +1,31 @@ == Physical Plan == -AdaptiveSparkPlan (28) +AdaptiveSparkPlan (30) +- == Final Plan == - VeloxColumnarToRowExec (19) - +- ^ SortExecTransformer (17) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8), Statistics(X) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (21) + +- ^ SortExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9), Statistics(X) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ ProjectExecTransformer (5) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ NoopFilter (2) +- ^ Scan parquet (1) +- == Initial Plan == - Sort (27) - +- Exchange (26) - +- HashAggregate (25) - +- Exchange (24) - +- HashAggregate (23) - +- Project (22) - +- Filter (21) - +- Scan parquet (20) + Sort (29) + +- Exchange (28) + +- HashAggregate (27) + +- Exchange (26) + +- HashAggregate (25) + +- Project (24) + +- Filter (23) + +- Scan parquet (22) (1) Scan parquet @@ -58,97 +58,105 @@ Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] +Arguments: X + +(8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [17]: 
[l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] +Arguments: X + +(15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, 
sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(17) SortExecTransformer +(19) SortExecTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(18) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(19) VeloxColumnarToRowExec +(21) VeloxColumnarToRowExec Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(20) Scan parquet +(22) Scan parquet Output [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), LessThanOrEqual(l_shipdate,1998-09-02)] ReadSchema: struct -(21) Filter +(23) Filter Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Condition : (isnotnull(l_shipdate#X) AND (l_shipdate#X <= 1998-09-02)) -(22) Project +(24) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] -(23) HashAggregate +(25) HashAggregate Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [partial_sum(l_quantity#X), partial_sum(l_extendedprice#X), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))), partial_sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6))), partial_avg(l_quantity#X), partial_avg(l_extendedprice#X), partial_avg(l_discount#X), partial_count(1)] Aggregate Attributes [15]: [sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Results [17]: [l_returnflag#X, 
l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(24) Exchange +(26) Exchange Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(25) HashAggregate +(27) HashAggregate Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(26) Exchange +(28) Exchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Sort +(29) Sort Input [10]: 
[l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(28) AdaptiveSparkPlan +(30) AdaptiveSparkPlan Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/10.txt index 66b2ccdc77b37..3be5f1996fa8c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/10.txt @@ -1,85 +1,91 @@ == Physical Plan == -AdaptiveSparkPlan (87) +AdaptiveSparkPlan (100) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- TakeOrderedAndProjectExecTransformer (59) - +- ^ ProjectExecTransformer (57) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) - +- ^ ProjectExecTransformer (50) - +- ^ FlushableHashAggregateExecTransformer (49) - +- ^ ProjectExecTransformer (48) - +- ^ ShuffledHashJoinExecTransformer Inner (47) - :- ^ InputIteratorTransformer (38) - : +- ^ InputAdapter (37) - : +- ^ ShuffleQueryStage (36), Statistics(X) - : +- ColumnarExchange (35) - : +- ^ ProjectExecTransformer (33) - : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : :- ^ InputIteratorTransformer (23) - : : +- ^ InputAdapter (22) - : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : +- ColumnarExchange (20) - : : +- ^ ProjectExecTransformer (18) - : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : :- ^ InputIteratorTransformer (8) - : : : +- ^ InputAdapter (7) - : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (67) + +- TakeOrderedAndProjectExecTransformer (66) + +- ^ ProjectExecTransformer (64) + +- ^ RegularHashAggregateExecTransformer (63) + +- ^ InputIteratorTransformer (62) + +- ShuffleQueryStage (60), Statistics(X) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ ProjectExecTransformer (56) + +- ^ FlushableHashAggregateExecTransformer (55) + +- ^ ProjectExecTransformer (54) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + :- ^ InputIteratorTransformer (43) + : +- ShuffleQueryStage (41), Statistics(X) + : +- ColumnarExchange (40) + : +- VeloxAppendBatches (39) + : +- ^ ProjectExecTransformer (37) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + : :- ^ InputIteratorTransformer (26) + : : +- ShuffleQueryStage (24), Statistics(X) + : : +- ColumnarExchange (23) + : : +- VeloxAppendBatches (22) + : : +- ^ ProjectExecTransformer (20) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : : :- ^ InputIteratorTransformer (9) + : : : +- ShuffleQueryStage (7), Statistics(X) + : : : +- ColumnarExchange (6) + : : : +- VeloxAppendBatches (5) : : : +- ^ ProjectExecTransformer (3) : : : +- ^ NoopFilter (2) : : : +- ^ Scan parquet (1) - : : +- ^ InputIteratorTransformer (16) - : : +- ^ InputAdapter (15) - : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : +- ColumnarExchange (13) - : : +- ^ ProjectExecTransformer (11) - : : +- ^ NoopFilter (10) - : : +- ^ Scan parquet (9) - : +- ^ InputIteratorTransformer (31) 
- : +- ^ InputAdapter (30) - : +- ^ ShuffleQueryStage (29), Statistics(X) - : +- ColumnarExchange (28) - : +- ^ ProjectExecTransformer (26) - : +- ^ NoopFilter (25) - : +- ^ Scan parquet (24) - +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ ShuffleQueryStage (44), Statistics(X) - +- ColumnarExchange (43) - +- ^ ProjectExecTransformer (41) - +- ^ NoopFilter (40) - +- ^ Scan parquet (39) + : : +- ^ InputIteratorTransformer (18) + : : +- ShuffleQueryStage (16), Statistics(X) + : : +- ColumnarExchange (15) + : : +- VeloxAppendBatches (14) + : : +- ^ ProjectExecTransformer (12) + : : +- ^ NoopFilter (11) + : : +- ^ Scan parquet (10) + : +- ^ InputIteratorTransformer (35) + : +- ShuffleQueryStage (33), Statistics(X) + : +- ColumnarExchange (32) + : +- VeloxAppendBatches (31) + : +- ^ ProjectExecTransformer (29) + : +- ^ NoopFilter (28) + : +- ^ Scan parquet (27) + +- ^ InputIteratorTransformer (52) + +- ShuffleQueryStage (50), Statistics(X) + +- ColumnarExchange (49) + +- VeloxAppendBatches (48) + +- ^ ProjectExecTransformer (46) + +- ^ NoopFilter (45) + +- ^ Scan parquet (44) +- == Initial Plan == - TakeOrderedAndProject (86) - +- HashAggregate (85) - +- Exchange (84) - +- HashAggregate (83) - +- Project (82) - +- ShuffledHashJoin Inner BuildRight (81) - :- Exchange (77) - : +- Project (76) - : +- ShuffledHashJoin Inner BuildRight (75) - : :- Exchange (70) - : : +- Project (69) - : : +- ShuffledHashJoin Inner BuildRight (68) - : : :- Exchange (63) - : : : +- Filter (62) - : : : +- Scan parquet (61) - : : +- Exchange (67) - : : +- Project (66) - : : +- Filter (65) - : : +- Scan parquet (64) - : +- Exchange (74) - : +- Project (73) - : +- Filter (72) - : +- Scan parquet (71) - +- Exchange (80) - +- Filter (79) - +- Scan parquet (78) + TakeOrderedAndProject (99) + +- HashAggregate (98) + +- Exchange (97) + +- HashAggregate (96) + +- Project (95) + +- SortMergeJoin Inner (94) + :- Sort (89) + : +- Exchange (88) + : +- Project (87) + : +- SortMergeJoin Inner (86) + : :- Sort (80) + : : +- Exchange (79) + : : +- Project (78) + : : +- SortMergeJoin Inner (77) + : : :- Sort (71) + : : : +- Exchange (70) + : : : +- Filter (69) + : : : +- Scan parquet (68) + : : +- Sort (76) + : : +- Exchange (75) + : : +- Project (74) + : : +- Filter (73) + : : +- Scan parquet (72) + : +- Sort (85) + : +- Exchange (84) + : +- Project (83) + : +- Filter (82) + : +- Scan parquet (81) + +- Sort (93) + +- Exchange (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet @@ -101,358 +107,410 @@ Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acct Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] +Arguments: X + +(6) ColumnarExchange Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] 
-(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [9]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] +Arguments: X + +(23) ColumnarExchange Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: 
[IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Arguments: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [10]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(40) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: 
[IsNotNull(n_nationkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [10]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(49) FlushableHashAggregateExecTransformer +(55) FlushableHashAggregateExecTransformer Input [10]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X, _pre_X#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(50) ProjectExecTransformer +(56) ProjectExecTransformer Output [10]: [hash(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(51) WholeStageCodegenTransformer (X) +(57) WholeStageCodegenTransformer (X) Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: false -(52) ColumnarExchange +(58) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] +Arguments: X + +(59) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 
sum#X, isEmpty#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(60) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: X -(54) InputAdapter +(61) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(55) InputIteratorTransformer +(62) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(56) RegularHashAggregateExecTransformer +(63) RegularHashAggregateExecTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(57) ProjectExecTransformer +(64) ProjectExecTransformer Output [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Input [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(58) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: false -(59) TakeOrderedAndProjectExecTransformer +(66) TakeOrderedAndProjectExecTransformer Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X], 0 -(60) VeloxColumnarToRowExec +(67) VeloxColumnarToRowExec Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(61) Scan parquet +(68) Scan parquet Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(62) Filter +(69) Filter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(63) Exchange +(70) Exchange Input [7]: 
[c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) Scan parquet +(71) Sort +Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(72) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(65) Filter +(73) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(66) Project +(74) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(67) Exchange +(75) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) ShuffledHashJoin +(76) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(77) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(69) Project +(78) Project Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(70) Exchange +(79) Exchange Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(80) Sort +Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(81) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(72) Filter +(82) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Condition : ((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(73) Project +(83) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(74) Exchange +(84) Exchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(75) ShuffledHashJoin +(85) Sort +Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(86) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(76) Project +(87) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(77) Exchange +(88) Exchange Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, 
c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(78) Scan parquet +(89) Sort +Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(79) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(80) Exchange +(92) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(81) ShuffledHashJoin +(93) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(94) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(82) Project +(95) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(83) HashAggregate +(96) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(84) Exchange +(97) Exchange Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) HashAggregate +(98) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(86) TakeOrderedAndProject +(99) TakeOrderedAndProject Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], 
[c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(87) AdaptiveSparkPlan +(100) AdaptiveSparkPlan Output [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/11.txt index 4bd0783da6fea..2347eb9b677e2 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/11.txt @@ -1,71 +1,75 @@ == Physical Plan == -AdaptiveSparkPlan (72) +AdaptiveSparkPlan (82) +- == Final Plan == - VeloxColumnarToRowExec (50) - +- ^ SortExecTransformer (48) - +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ ShuffleQueryStage (45), Statistics(X) - +- ColumnarExchange (44) - +- ^ FilterExecTransformer (42) - +- ^ RegularHashAggregateExecTransformer (41) - +- ^ InputIteratorTransformer (40) - +- ^ InputAdapter (39) - +- ^ ShuffleQueryStage (38), Statistics(X) - +- ColumnarExchange (37) - +- ^ ProjectExecTransformer (35) - +- ^ FlushableHashAggregateExecTransformer (34) - +- ^ ProjectExecTransformer (33) - +- ^ ShuffledHashJoinExecTransformer Inner (32) - :- ^ InputIteratorTransformer (23) - : +- ^ InputAdapter (22) - : +- ^ ShuffleQueryStage (21), Statistics(X) - : +- ColumnarExchange (20) - : +- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (56) + +- ^ SortExecTransformer (54) + +- ^ InputIteratorTransformer (53) + +- ShuffleQueryStage (51), Statistics(X) + +- ColumnarExchange (50) + +- VeloxAppendBatches (49) + +- ^ FilterExecTransformer (47) + +- ^ RegularHashAggregateExecTransformer (46) + +- ^ InputIteratorTransformer (45) + +- ShuffleQueryStage (43), Statistics(X) + +- ColumnarExchange (42) + +- VeloxAppendBatches (41) + +- ^ ProjectExecTransformer (39) + +- ^ FlushableHashAggregateExecTransformer (38) + +- ^ ProjectExecTransformer (37) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + :- ^ InputIteratorTransformer (26) + : +- ShuffleQueryStage (24), Statistics(X) + : +- ColumnarExchange (23) + : +- VeloxAppendBatches (22) + : +- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14), Statistics(X) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ NoopFilter (10) - : +- ^ Scan parquet (9) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ ProjectExecTransformer (26) - +- ^ NoopFilter (25) - +- ^ Scan parquet (24) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ NoopFilter (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), 
Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ ProjectExecTransformer (29) + +- ^ NoopFilter (28) + +- ^ Scan parquet (27) +- == Initial Plan == - Sort (71) - +- Exchange (70) - +- Filter (69) - +- HashAggregate (68) - +- Exchange (67) - +- HashAggregate (66) - +- Project (65) - +- ShuffledHashJoin Inner BuildRight (64) - :- Exchange (59) - : +- Project (58) - : +- ShuffledHashJoin Inner BuildRight (57) - : :- Exchange (53) - : : +- Filter (52) - : : +- Scan parquet (51) - : +- Exchange (56) - : +- Filter (55) - : +- Scan parquet (54) - +- Exchange (63) - +- Project (62) - +- Filter (61) - +- Scan parquet (60) + Sort (81) + +- Exchange (80) + +- Filter (79) + +- HashAggregate (78) + +- Exchange (77) + +- HashAggregate (76) + +- Project (75) + +- SortMergeJoin Inner (74) + :- Sort (68) + : +- Exchange (67) + : +- Project (66) + : +- SortMergeJoin Inner (65) + : :- Sort (60) + : : +- Exchange (59) + : : +- Filter (58) + : : +- Scan parquet (57) + : +- Sort (64) + : +- Exchange (63) + : +- Filter (62) + : +- Scan parquet (61) + +- Sort (73) + +- Exchange (72) + +- Project (71) + +- Filter (70) + +- Scan parquet (69) (1) Scan parquet @@ -87,553 +91,619 @@ Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(9) Scan parquet +(10) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: [s_suppkey#X, s_nationkey#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: 
[ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: X + +(23) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [1]: [n_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [1]: [n_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)) AS _pre_X#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(34) FlushableHashAggregateExecTransformer +(38) FlushableHashAggregateExecTransformer Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(35) ProjectExecTransformer +(39) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(36) 
WholeStageCodegenTransformer (X) +(40) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: false -(37) ColumnarExchange +(41) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] +Arguments: X + +(42) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(38) ShuffleQueryStage +(43) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: X -(39) InputAdapter +(44) InputAdapter Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(40) InputIteratorTransformer +(45) InputIteratorTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(41) RegularHashAggregateExecTransformer +(46) RegularHashAggregateExecTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X AS value#X] -(42) FilterExecTransformer +(47) FilterExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(43) WholeStageCodegenTransformer (X) +(48) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(44) ColumnarExchange +(49) VeloxAppendBatches +Input [2]: [ps_partkey#X, value#X] +Arguments: X + +(50) ColumnarExchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(45) ShuffleQueryStage +(51) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] Arguments: X -(46) InputAdapter +(52) InputAdapter Input [2]: [ps_partkey#X, value#X] -(47) InputIteratorTransformer +(53) InputIteratorTransformer Input [2]: [ps_partkey#X, value#X] -(48) SortExecTransformer +(54) SortExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(50) VeloxColumnarToRowExec +(56) VeloxColumnarToRowExec Input [2]: [ps_partkey#X, value#X] -(51) Scan parquet +(57) Scan parquet Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(52) Filter +(58) Filter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(53) Exchange +(59) Exchange Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) Scan parquet +(60) Sort +Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(61) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(55) Filter +(62) Filter Input [2]: 
[s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(56) Exchange +(63) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(57) ShuffledHashJoin +(64) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(65) SortMergeJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(58) Project +(66) Project Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(59) Exchange +(67) Exchange Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(60) Scan parquet +(68) Sort +Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(69) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(61) Filter +(70) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(62) Project +(71) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(63) Exchange +(72) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) ShuffledHashJoin +(73) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(74) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(65) Project +(75) Project Output [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(66) HashAggregate +(76) HashAggregate Input [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(67) Exchange +(77) Exchange Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) HashAggregate +(78) HashAggregate Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X AS value#X] -(69) Filter +(79) Filter Input [2]: [ps_partkey#X, value#X] Condition : (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(70) Exchange +(80) Exchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Sort +(81) Sort Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 
-(72) AdaptiveSparkPlan +(82) AdaptiveSparkPlan Output [2]: [ps_partkey#X, value#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 42 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (120) +Subquery:1 Hosting operator id = 47 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (136) +- == Final Plan == - VeloxColumnarToRowExec (102) - +- ^ ProjectExecTransformer (100) - +- ^ RegularHashAggregateExecTransformer (99) - +- ^ RegularHashAggregateExecTransformer (98) - +- ^ ProjectExecTransformer (97) - +- ^ ShuffledHashJoinExecTransformer Inner (96) - :- ^ InputIteratorTransformer (91) - : +- ^ InputAdapter (90) - : +- ^ ShuffleQueryStage (89), Statistics(X) - : +- ColumnarExchange (88) - : +- ^ ProjectExecTransformer (86) - : +- ^ ShuffledHashJoinExecTransformer Inner (85) - : :- ^ InputIteratorTransformer (80) - : : +- ^ InputAdapter (79) - : : +- ^ ShuffleQueryStage (78), Statistics(X) - : : +- ColumnarExchange (77) - : : +- ^ ProjectExecTransformer (75) - : : +- ^ NoopFilter (74) - : : +- ^ Scan parquet (73) - : +- ^ InputIteratorTransformer (84) - : +- ^ InputAdapter (83) - : +- ^ ShuffleQueryStage (82), Statistics(X) - : +- ReusedExchange (81) - +- ^ InputIteratorTransformer (95) - +- ^ InputAdapter (94) - +- ^ ShuffleQueryStage (93), Statistics(X) - +- ReusedExchange (92) + VeloxColumnarToRowExec (114) + +- ^ ProjectExecTransformer (112) + +- ^ RegularHashAggregateExecTransformer (111) + +- ^ RegularHashAggregateExecTransformer (110) + +- ^ ProjectExecTransformer (109) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (108) + :- ^ InputIteratorTransformer (103) + : +- ShuffleQueryStage (101), Statistics(X) + : +- ColumnarExchange (100) + : +- VeloxAppendBatches (99) + : +- ^ ProjectExecTransformer (97) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (96) + : :- ^ InputIteratorTransformer (91) + : : +- ShuffleQueryStage (89), Statistics(X) + : : +- ColumnarExchange (88) + : : +- VeloxAppendBatches (87) + : : +- ^ ProjectExecTransformer (85) + : : +- ^ NoopFilter (84) + : : +- ^ Scan parquet (83) + : +- ^ InputIteratorTransformer (95) + : +- ShuffleQueryStage (93), Statistics(X) + : +- ReusedExchange (92) + +- ^ InputIteratorTransformer (107) + +- ShuffleQueryStage (105), Statistics(X) + +- ReusedExchange (104) +- == Initial Plan == - HashAggregate (119) - +- HashAggregate (118) - +- Project (117) - +- ShuffledHashJoin Inner BuildRight (116) - :- Exchange (111) - : +- Project (110) - : +- ShuffledHashJoin Inner BuildRight (109) - : :- Exchange (105) - : : +- Filter (104) - : : +- Scan parquet (103) - : +- Exchange (108) - : +- Filter (107) - : +- Scan parquet (106) - +- Exchange (115) - +- Project (114) - +- Filter (113) - +- Scan parquet (112) - - -(73) Scan parquet + HashAggregate (135) + +- HashAggregate (134) + +- Project (133) + +- SortMergeJoin Inner (132) + :- Sort (126) + : +- Exchange (125) + : +- Project (124) + : +- SortMergeJoin Inner (123) + : :- Sort (118) + : : +- Exchange (117) + : : +- Filter (116) + : : +- Scan parquet (115) + : +- Sort (122) + : +- Exchange (121) + : +- Filter (120) + : +- Scan parquet (119) + +- Sort (131) + +- Exchange (130) + +- Project (129) + +- Filter (128) + +- Scan parquet (127) + + +(83) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(74) NoopFilter +(84) NoopFilter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] 
Arguments: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(75) ProjectExecTransformer +(85) ProjectExecTransformer Output [4]: [hash(ps_suppkey#X, 42) AS hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(76) WholeStageCodegenTransformer (X) +(86) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: false -(77) ColumnarExchange +(87) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: X + +(88) ColumnarExchange Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] -(78) ShuffleQueryStage +(89) ShuffleQueryStage Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: X -(79) InputAdapter +(90) InputAdapter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(80) InputIteratorTransformer +(91) InputIteratorTransformer Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(81) ReusedExchange [Reuses operator id: 13] +(92) ReusedExchange [Reuses operator id: 15] Output [2]: [s_suppkey#X, s_nationkey#X] -(82) ShuffleQueryStage +(93) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(83) InputAdapter +(94) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(84) InputIteratorTransformer +(95) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(85) ShuffledHashJoinExecTransformer +(96) ShuffledHashJoinExecTransformer Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(86) ProjectExecTransformer +(97) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: X + +(100) ColumnarExchange Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(92) ReusedExchange [Reuses operator id: 28] +(104) ReusedExchange [Reuses operator id: 32] Output [1]: [n_nationkey#X] -(93) ShuffleQueryStage +(105) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(94) InputAdapter +(106) InputAdapter Input [1]: [n_nationkey#X] -(95) InputIteratorTransformer +(107) InputIteratorTransformer Input [1]: [n_nationkey#X] -(96) ShuffledHashJoinExecTransformer +(108) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(97) ProjectExecTransformer +(109) ProjectExecTransformer Output [3]: [ps_availqty#X, ps_supplycost#X, 
CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)) AS _pre_X#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(98) RegularHashAggregateExecTransformer +(110) RegularHashAggregateExecTransformer Input [3]: [ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(99) RegularHashAggregateExecTransformer +(111) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] -(100) ProjectExecTransformer +(112) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(cast(sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X as decimal(38,10))) * 0.0001000000), DecimalType(38,6)) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Input [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] -(101) WholeStageCodegenTransformer (X) +(113) WholeStageCodegenTransformer (X) Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: false -(102) VeloxColumnarToRowExec +(114) VeloxColumnarToRowExec Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(103) Scan parquet +(115) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(104) Filter +(116) Filter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(105) Exchange +(117) Exchange Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) Scan parquet +(118) Sort +Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(119) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(107) Filter +(120) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(108) Exchange +(121) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(122) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(123) SortMergeJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(110) Project +(124) Project Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(111) Exchange +(125) Exchange Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, 
[plan_id=X] -(112) Scan parquet +(126) Sort +Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(127) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(113) Filter +(128) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(114) Project +(129) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(115) Exchange +(130) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(116) ShuffledHashJoin +(131) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(117) Project +(133) Project Output [2]: [ps_availqty#X, ps_supplycost#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(118) HashAggregate +(134) HashAggregate Input [2]: [ps_availqty#X, ps_supplycost#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(119) HashAggregate +(135) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [1]: [CheckOverflow((promote_precision(cast(sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X as decimal(38,10))) * 0.0001000000), DecimalType(38,6)) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(120) AdaptiveSparkPlan +(136) AdaptiveSparkPlan Output [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/12.txt index ec56c2f995435..b0f084e2d0486 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/12.txt @@ -1,50 +1,52 @@ == Physical Plan == -AdaptiveSparkPlan (49) +AdaptiveSparkPlan (55) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- ^ SortExecTransformer (32) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) 
+ +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (48) - +- Exchange (47) - +- HashAggregate (46) - +- Exchange (45) - +- HashAggregate (44) - +- Project (43) - +- ShuffledHashJoin Inner BuildLeft (42) - :- Exchange (37) - : +- Filter (36) - : +- Scan parquet (35) - +- Exchange (41) - +- Project (40) - +- Filter (39) - +- Scan parquet (38) + Sort (54) + +- Exchange (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- Project (49) + +- SortMergeJoin Inner (48) + :- Sort (42) + : +- Exchange (41) + : +- Filter (40) + : +- Scan parquet (39) + +- Sort (47) + +- Exchange (46) + +- Project (45) + +- Filter (44) + +- Scan parquet (43) (1) Scan parquet @@ -66,196 +68,220 @@ Input [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [o_orderkey#X, o_orderpriority#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderpriority#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Arguments: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: 
[hash_partition_key#X, l_orderkey#X, l_shipmode#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_shipmode#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [l_orderkey#X, l_shipmode#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [l_orderkey#X, l_shipmode#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [o_orderpriority#X, l_shipmode#X, CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END AS _pre_X#X, CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END AS _pre_X#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [o_orderpriority#X, l_shipmode#X, _pre_X#X, _pre_X#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(_pre_X#X), partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [4]: [hash(l_shipmode#X, 42) AS hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Input [3]: [l_shipmode#X, sum#X, sum#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] +Arguments: X + +(25) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [3]: [l_shipmode#X, sum#X, sum#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] 
Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(32) SortExecTransformer +(36) SortExecTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(33) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(34) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(35) Scan parquet +(39) Scan parquet Output [2]: [o_orderkey#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(36) Filter +(40) Filter Input [2]: [o_orderkey#X, o_orderpriority#X] Condition : isnotnull(o_orderkey#X) -(37) Exchange +(41) Exchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(38) Scan parquet +(42) Sort +Input [2]: [o_orderkey#X, o_orderpriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(43) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(39) Filter +(44) Filter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Condition : ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(40) Project +(45) Project Output [2]: [l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(41) Exchange +(46) Exchange Input [2]: [l_orderkey#X, l_shipmode#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(42) ShuffledHashJoin +(47) Sort +Input [2]: [l_orderkey#X, l_shipmode#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(48) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(43) Project +(49) Project Output [2]: [o_orderpriority#X, l_shipmode#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(44) HashAggregate +(50) HashAggregate Input [2]: [o_orderpriority#X, l_shipmode#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), partial_sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) 
THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(45) Exchange +(51) Exchange Input [3]: [l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(46) HashAggregate +(52) HashAggregate Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(47) Exchange +(53) Exchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(48) Sort +(54) Sort Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(49) AdaptiveSparkPlan +(55) AdaptiveSparkPlan Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/13.txt index f7d8f3c3b72f2..07c32ff95fb1e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/13.txt @@ -1,53 +1,55 @@ == Physical Plan == -AdaptiveSparkPlan (52) +AdaptiveSparkPlan (58) +- == Final Plan == - VeloxColumnarToRowExec (36) - +- ^ SortExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31), Statistics(X) - +- ColumnarExchange (30) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ ProjectExecTransformer (17) - +- ^ ShuffledHashJoinExecTransformer LeftOuter (16) - :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ ShuffleQueryStage (5), Statistics(X) - : +- ColumnarExchange (4) + VeloxColumnarToRowExec (40) + +- ^ SortExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35), Statistics(X) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ FlushableHashAggregateExecTransformer (23) + +- ^ ProjectExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ ProjectExecTransformer (19) + +- ^ 
ShuffledHashJoinExecTransformer LeftOuter BuildLeft (18) + :- ^ InputIteratorTransformer (8) + : +- ShuffleQueryStage (6), Statistics(X) + : +- ColumnarExchange (5) + : +- VeloxAppendBatches (4) : +- ^ ProjectExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (15) - +- ^ InputAdapter (14) - +- ^ ShuffleQueryStage (13), Statistics(X) - +- ColumnarExchange (12) - +- ^ ProjectExecTransformer (10) - +- ^ NoopFilter (9) - +- ^ Scan parquet (8) + +- ^ InputIteratorTransformer (17) + +- ShuffleQueryStage (15), Statistics(X) + +- ColumnarExchange (14) + +- VeloxAppendBatches (13) + +- ^ ProjectExecTransformer (11) + +- ^ NoopFilter (10) + +- ^ Scan parquet (9) +- == Initial Plan == - Sort (51) - +- Exchange (50) - +- HashAggregate (49) - +- Exchange (48) - +- HashAggregate (47) - +- HashAggregate (46) - +- HashAggregate (45) - +- Project (44) - +- ShuffledHashJoin LeftOuter BuildRight (43) - :- Exchange (38) - : +- Scan parquet (37) - +- Exchange (42) - +- Project (41) - +- Filter (40) - +- Scan parquet (39) + Sort (57) + +- Exchange (56) + +- HashAggregate (55) + +- Exchange (54) + +- HashAggregate (53) + +- HashAggregate (52) + +- HashAggregate (51) + +- Project (50) + +- SortMergeJoin LeftOuter (49) + :- Sort (43) + : +- Exchange (42) + : +- Scan parquet (41) + +- Sort (48) + +- Exchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -64,223 +66,247 @@ Input [1]: [c_custkey#X] Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: false -(4) ColumnarExchange +(4) VeloxAppendBatches +Input [2]: [hash_partition_key#X, c_custkey#X] +Arguments: X + +(5) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] -(5) ShuffleQueryStage +(6) ShuffleQueryStage Output [1]: [c_custkey#X] Arguments: X -(6) InputAdapter +(7) InputAdapter Input [1]: [c_custkey#X] -(7) InputIteratorTransformer +(8) InputIteratorTransformer Input [1]: [c_custkey#X] -(8) Scan parquet +(9) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(9) NoopFilter +(10) NoopFilter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Arguments: [o_orderkey#X, o_custkey#X, o_comment#X] -(10) ProjectExecTransformer +(11) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(12) ColumnarExchange +(13) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(14) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(13) ShuffleQueryStage +(15) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(14) InputAdapter +(16) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(15) InputIteratorTransformer +(17) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(16) ShuffledHashJoinExecTransformer +(18) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(17) ProjectExecTransformer +(19) ProjectExecTransformer 
Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(18) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [2]: [c_custkey#X, count(o_orderkey#X)#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [1]: [count(o_orderkey#X)#X AS c_count#X] Input [2]: [c_custkey#X, count(o_orderkey#X)#X] -(21) FlushableHashAggregateExecTransformer +(23) FlushableHashAggregateExecTransformer Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [3]: [hash(c_count#X, 42) AS hash_partition_key#X, c_count#X, count#X] Input [2]: [c_count#X, count#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_count#X, count#X] +Arguments: X + +(27) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [2]: [c_count#X, count#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [2]: [c_count#X, count#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input [2]: [c_count#X, count#X] -(28) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(30) ColumnarExchange +(33) VeloxAppendBatches +Input [2]: [c_count#X, custdist#X] +Arguments: X + +(34) ColumnarExchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(35) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] Arguments: X -(32) InputAdapter +(36) InputAdapter Input [2]: [c_count#X, custdist#X] -(33) InputIteratorTransformer +(37) InputIteratorTransformer Input [2]: [c_count#X, custdist#X] -(34) SortExecTransformer +(38) SortExecTransformer Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(36) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [2]: [c_count#X, custdist#X] -(37) Scan parquet +(41) Scan parquet Output [1]: [c_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Exchange +(42) Exchange Input [1]: [c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) Scan parquet +(43) Sort +Input [1]: [c_custkey#X] 
+Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(44) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(40) Filter +(45) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Condition : ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(41) Project +(46) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(42) Exchange +(47) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) ShuffledHashJoin +(48) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(49) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(44) Project +(50) Project Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(45) HashAggregate +(51) HashAggregate Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(46) HashAggregate +(52) HashAggregate Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [1]: [count(o_orderkey#X)#X AS c_count#X] -(47) HashAggregate +(53) HashAggregate Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(48) Exchange +(54) Exchange Input [2]: [c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) HashAggregate +(55) HashAggregate Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(50) Exchange +(56) Exchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Sort +(57) Sort Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(52) AdaptiveSparkPlan +(58) AdaptiveSparkPlan Output [2]: [c_count#X, custdist#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/14.txt index 4f4492d881fda..11bbb2a71e795 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/14.txt @@ -1,38 +1,40 @@ == Physical Plan == -AdaptiveSparkPlan (35) +AdaptiveSparkPlan (39) +- == Final Plan == - VeloxColumnarToRowExec (23) - +- ^ ProjectExecTransformer (21) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (25) + +- ^ ProjectExecTransformer (23) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer 
(20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - HashAggregate (34) - +- HashAggregate (33) - +- Project (32) - +- ShuffledHashJoin Inner BuildRight (31) - :- Exchange (27) - : +- Project (26) - : +- Filter (25) - : +- Scan parquet (24) - +- Exchange (30) - +- Filter (29) - +- Scan parquet (28) + HashAggregate (38) + +- HashAggregate (37) + +- Project (36) + +- SortMergeJoin Inner (35) + :- Sort (30) + : +- Exchange (29) + : +- Project (28) + : +- Filter (27) + : +- Scan parquet (26) + +- Sort (34) + +- Exchange (33) + +- Filter (32) + +- Scan parquet (31) (1) Scan parquet @@ -54,144 +56,160 @@ Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] -(9) Scan parquet +(10) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [2]: [p_partkey#X, p_type#X] Arguments: [p_partkey#X, p_type#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_type#X] Input [2]: [p_partkey#X, p_type#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [p_partkey#X, p_type#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [p_partkey#X, p_type#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [p_partkey#X, p_type#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) 
ProjectExecTransformer Output [5]: [l_extendedprice#X, l_discount#X, p_type#X, CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END AS _pre_X#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [5]: [l_extendedprice#X, l_discount#X, p_type#X, _pre_X#X, _pre_X#X] Keys: [] Functions [2]: [partial_sum(_pre_X#X), partial_sum(_pre_X#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(20) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(21) ProjectExecTransformer +(23) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X)), DecimalType(38,6))) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X as decimal(38,6)))), DecimalType(38,6)) AS promo_revenue#X] Input [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN 
CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(22) WholeStageCodegenTransformer (X) +(24) WholeStageCodegenTransformer (X) Input [1]: [promo_revenue#X] Arguments: false -(23) VeloxColumnarToRowExec +(25) VeloxColumnarToRowExec Input [1]: [promo_revenue#X] -(24) Scan parquet +(26) Scan parquet Output [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-09-01), LessThan(l_shipdate,1995-10-01), IsNotNull(l_partkey)] ReadSchema: struct -(25) Filter +(27) Filter Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-09-01)) AND (l_shipdate#X < 1995-10-01)) AND isnotnull(l_partkey#X)) -(26) Project +(28) Project Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(27) Exchange +(29) Exchange Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(28) Scan parquet +(30) Sort +Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(31) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(29) Filter +(32) Filter Input [2]: [p_partkey#X, p_type#X] Condition : isnotnull(p_partkey#X) -(30) Exchange +(33) Exchange Input [2]: [p_partkey#X, p_type#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(31) ShuffledHashJoin +(34) Sort +Input [2]: [p_partkey#X, p_type#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(35) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(32) Project +(36) Project Output [3]: [l_extendedprice#X, l_discount#X, p_type#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(33) HashAggregate +(37) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, p_type#X] Keys: [] Functions [2]: [partial_sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(34) HashAggregate +(38) HashAggregate Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), 
DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X)), DecimalType(38,6))) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X as decimal(38,6)))), DecimalType(38,6)) AS promo_revenue#X] -(35) AdaptiveSparkPlan +(39) AdaptiveSparkPlan Output [1]: [promo_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/15.txt index 98249fae7dd7d..be97f58cf4386 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/15.txt @@ -1,45 +1,48 @@ == Physical Plan == -AdaptiveSparkPlan (42) +AdaptiveSparkPlan (47) +- == Final Plan == - VeloxColumnarToRowExec (27) - +- AQEShuffleRead (26) - +- ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ ShuffledHashJoinExecTransformer Inner (21) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) - : +- ^ ProjectExecTransformer (3) - : +- ^ NoopFilter (2) - : +- ^ Scan parquet (1) - +- ^ FilterExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) - +- ^ ProjectExecTransformer (13) - +- ^ FlushableHashAggregateExecTransformer (12) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + VeloxColumnarToRowExec (30) + +- AQEShuffleRead (29) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (23) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) + : +- ^ ProjectExecTransformer (3) + : +- ^ NoopFilter (2) + : +- ^ Scan parquet (1) + +- ^ FilterExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ InputIteratorTransformer (20) + +- ShuffleQueryStage 
(18), Statistics(X) + +- ColumnarExchange (17) + +- VeloxAppendBatches (16) + +- ^ ProjectExecTransformer (14) + +- ^ FlushableHashAggregateExecTransformer (13) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (41) - +- Exchange (40) - +- Project (39) - +- ShuffledHashJoin Inner BuildLeft (38) - :- Exchange (30) - : +- Filter (29) - : +- Scan parquet (28) - +- Filter (37) - +- HashAggregate (36) - +- Exchange (35) - +- HashAggregate (34) - +- Project (33) - +- Filter (32) - +- Scan parquet (31) + Sort (46) + +- Exchange (45) + +- Project (44) + +- SortMergeJoin Inner (43) + :- Sort (34) + : +- Exchange (33) + : +- Filter (32) + : +- Scan parquet (31) + +- Sort (42) + +- Filter (41) + +- HashAggregate (40) + +- Exchange (39) + +- HashAggregate (38) + +- Project (37) + +- Filter (36) + +- Scan parquet (35) (1) Scan parquet @@ -61,329 +64,353 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(12) FlushableHashAggregateExecTransformer +(13) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(13) ProjectExecTransformer +(14) ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(14) WholeStageCodegenTransformer (X) +(15) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(16) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(17) ColumnarExchange Input [4]: [hash_partition_key#X, 
l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(18) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(19) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(18) InputIteratorTransformer +(20) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] -(20) FilterExecTransformer +(22) FilterExecTransformer Input [2]: [supplier_no#X, total_revenue#X] Arguments: (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(21) ShuffledHashJoinExecTransformer +(23) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] +Arguments: X + +(27) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: X -(26) AQEShuffleRead +(29) AQEShuffleRead Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: local -(27) VeloxColumnarToRowExec +(30) VeloxColumnarToRowExec Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(28) Scan parquet +(31) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey)] ReadSchema: struct -(29) Filter +(32) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Condition : isnotnull(s_suppkey#X) -(30) Exchange +(33) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(31) Scan parquet +(34) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(35) Scan parquet Output [4]: [l_suppkey#X, 
l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(32) Filter +(36) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(33) Project +(37) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(34) HashAggregate +(38) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(35) Exchange +(39) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(36) HashAggregate +(40) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] -(37) Filter +(41) Filter Input [2]: [supplier_no#X, total_revenue#X] Condition : (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(38) ShuffledHashJoin +(42) Sort +Input [2]: [supplier_no#X, total_revenue#X] +Arguments: [supplier_no#X ASC NULLS FIRST], false, 0 + +(43) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(39) Project +(44) Project Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(40) Exchange +(45) Exchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Sort +(46) Sort Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(42) AdaptiveSparkPlan +(47) AdaptiveSparkPlan Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 20 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (67) +Subquery:1 Hosting operator id = 22 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (73) +- == Final Plan == - VeloxColumnarToRowExec (58) - +- ^ 
RegularHashAggregateExecTransformer (56) - +- ^ RegularHashAggregateExecTransformer (55) - +- ^ ProjectExecTransformer (54) - +- ^ RegularHashAggregateExecTransformer (53) - +- ^ InputIteratorTransformer (52) - +- ^ InputAdapter (51) - +- ^ ShuffleQueryStage (50), Statistics(X) - +- ColumnarExchange (49) - +- ^ ProjectExecTransformer (47) - +- ^ FlushableHashAggregateExecTransformer (46) - +- ^ ProjectExecTransformer (45) - +- ^ NoopFilter (44) - +- ^ Scan parquet (43) + VeloxColumnarToRowExec (64) + +- ^ RegularHashAggregateExecTransformer (62) + +- ^ RegularHashAggregateExecTransformer (61) + +- ^ ProjectExecTransformer (60) + +- ^ RegularHashAggregateExecTransformer (59) + +- ^ InputIteratorTransformer (58) + +- ShuffleQueryStage (56), Statistics(X) + +- ColumnarExchange (55) + +- VeloxAppendBatches (54) + +- ^ ProjectExecTransformer (52) + +- ^ FlushableHashAggregateExecTransformer (51) + +- ^ ProjectExecTransformer (50) + +- ^ NoopFilter (49) + +- ^ Scan parquet (48) +- == Initial Plan == - HashAggregate (66) - +- HashAggregate (65) - +- HashAggregate (64) - +- Exchange (63) - +- HashAggregate (62) - +- Project (61) - +- Filter (60) - +- Scan parquet (59) + HashAggregate (72) + +- HashAggregate (71) + +- HashAggregate (70) + +- Exchange (69) + +- HashAggregate (68) + +- Project (67) + +- Filter (66) + +- Scan parquet (65) -(43) Scan parquet +(48) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(44) NoopFilter +(49) NoopFilter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(45) ProjectExecTransformer +(50) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(46) FlushableHashAggregateExecTransformer +(51) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(47) ProjectExecTransformer +(52) ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(48) WholeStageCodegenTransformer (X) +(53) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(49) ColumnarExchange +(54) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(55) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(50) ShuffleQueryStage +(56) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(51) InputAdapter +(57) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(52) InputIteratorTransformer +(58) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(53) 
RegularHashAggregateExecTransformer +(59) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [l_suppkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(54) ProjectExecTransformer +(60) ProjectExecTransformer Output [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] Input [2]: [l_suppkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(55) RegularHashAggregateExecTransformer +(61) RegularHashAggregateExecTransformer Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(56) RegularHashAggregateExecTransformer +(62) RegularHashAggregateExecTransformer Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(57) WholeStageCodegenTransformer (X) +(63) WholeStageCodegenTransformer (X) Input [1]: [max(total_revenue)#X] Arguments: false -(58) VeloxColumnarToRowExec +(64) VeloxColumnarToRowExec Input [1]: [max(total_revenue)#X] -(59) Scan parquet +(65) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(60) Filter +(66) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) -(61) Project +(67) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(62) HashAggregate +(68) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(63) Exchange +(69) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) HashAggregate +(70) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: 
[sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] -(65) HashAggregate +(71) HashAggregate Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(66) HashAggregate +(72) HashAggregate Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(67) AdaptiveSparkPlan +(73) AdaptiveSparkPlan Output [1]: [max(total_revenue)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/16.txt index 230e9c890d1da..86d2f321f653e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/16.txt @@ -1,64 +1,66 @@ == Physical Plan == -AdaptiveSparkPlan (64) +AdaptiveSparkPlan (71) +- == Final Plan == - VeloxColumnarToRowExec (42) - +- ^ SortExecTransformer (40) - +- ^ InputIteratorTransformer (39) - +- ^ InputAdapter (38) - +- ^ ShuffleQueryStage (37), Statistics(X) - +- ColumnarExchange (36) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31), Statistics(X) - +- ColumnarExchange (30) - +- ^ ProjectExecTransformer (28) - +- ^ FlushableHashAggregateExecTransformer (27) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (47) + +- ^ SortExecTransformer (45) + +- ^ InputIteratorTransformer (44) + +- ShuffleQueryStage (42), Statistics(X) + +- ColumnarExchange (41) + +- VeloxAppendBatches (40) + +- ^ RegularHashAggregateExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35), Statistics(X) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ ProjectExecTransformer (31) + +- ^ FlushableHashAggregateExecTransformer (30) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer 
(9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (63) - +- Exchange (62) - +- HashAggregate (61) - +- Exchange (60) - +- HashAggregate (59) - +- HashAggregate (58) - +- Exchange (57) - +- HashAggregate (56) - +- Project (55) - +- ShuffledHashJoin Inner BuildRight (54) - :- Exchange (50) - : +- BroadcastHashJoin LeftAnti BuildRight (49) - : :- Filter (44) - : : +- Scan parquet (43) - : +- BroadcastExchange (48) - : +- Project (47) - : +- Filter (46) - : +- Scan parquet (45) - +- Exchange (53) - +- Filter (52) - +- Scan parquet (51) + Sort (70) + +- Exchange (69) + +- HashAggregate (68) + +- Exchange (67) + +- HashAggregate (66) + +- HashAggregate (65) + +- Exchange (64) + +- HashAggregate (63) + +- Project (62) + +- SortMergeJoin Inner (61) + :- Sort (56) + : +- Exchange (55) + : +- BroadcastHashJoin LeftAnti BuildRight (54) + : :- Filter (49) + : : +- Scan parquet (48) + : +- BroadcastExchange (53) + : +- Project (52) + : +- Filter (51) + : +- Scan parquet (50) + +- Sort (60) + +- Exchange (59) + +- Filter (58) + +- Scan parquet (57) (1) Scan parquet @@ -80,270 +82,298 @@ Input [2]: [ps_partkey#X, ps_suppkey#X] Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [ps_partkey#X, ps_suppkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [ps_partkey#X, ps_suppkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [ps_partkey#X, ps_suppkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: 
hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(27) FlushableHashAggregateExecTransformer +(30) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(28) ProjectExecTransformer +(31) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: false -(30) ColumnarExchange +(33) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] +Arguments: X + +(34) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] 
Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(35) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: X -(32) InputAdapter +(36) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(33) InputIteratorTransformer +(37) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(34) RegularHashAggregateExecTransformer +(38) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] +Arguments: X + +(41) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(40) SortExecTransformer +(45) SortExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(41) WholeStageCodegenTransformer (X) +(46) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(42) VeloxColumnarToRowExec +(47) VeloxColumnarToRowExec Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(43) Scan parquet +(48) Scan parquet Output [2]: [ps_partkey#X, ps_suppkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_partkey)] ReadSchema: struct -(44) Filter +(49) Filter Input [2]: [ps_partkey#X, ps_suppkey#X] Condition : isnotnull(ps_partkey#X) -(45) Scan parquet +(50) Scan parquet Output [2]: [s_suppkey#X, s_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_comment)] ReadSchema: struct -(46) Filter +(51) Filter Input [2]: [s_suppkey#X, s_comment#X] Condition : (isnotnull(s_comment#X) AND s_comment#X LIKE %Customer%Complaints%) -(47) Project +(52) Project Output [1]: [s_suppkey#X] Input [2]: [s_suppkey#X, s_comment#X] -(48) BroadcastExchange +(53) BroadcastExchange Input [1]: [s_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),true), [plan_id=X] -(49) BroadcastHashJoin +(54) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(50) Exchange +(55) Exchange Input [2]: [ps_partkey#X, ps_suppkey#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Scan parquet +(56) Sort +Input [2]: [ps_partkey#X, ps_suppkey#X] +Arguments: [ps_partkey#X ASC NULLS FIRST], false, 0 + +(57) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, 
p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(52) Filter +(58) Filter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Condition : (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(53) Exchange +(59) Exchange Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) ShuffledHashJoin +(60) Sort +Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(61) SortMergeJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(55) Project +(62) Project Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(56) HashAggregate +(63) HashAggregate Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(57) Exchange +(64) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(58) HashAggregate +(65) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(59) HashAggregate +(66) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(60) Exchange +(67) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(61) HashAggregate +(68) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(62) Exchange +(69) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(63) Sort +(70) Sort Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(64) AdaptiveSparkPlan +(71) AdaptiveSparkPlan Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/17.txt index 3848cd252b450..6a2e47576cadf 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/17.txt @@ -1,59 +1,62 @@ == Physical Plan == -AdaptiveSparkPlan (57) +AdaptiveSparkPlan (63) +- == Final Plan == - VeloxColumnarToRowExec (37) - +- ^ ProjectExecTransformer (35) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ RegularHashAggregateExecTransformer (33) - +- ^ ProjectExecTransformer (32) - +- ^ ShuffledHashJoinExecTransformer Inner (31) - :- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (40) + +- ^ ProjectExecTransformer (38) + +- ^ RegularHashAggregateExecTransformer (37) + +- ^ RegularHashAggregateExecTransformer (36) + +- ^ ProjectExecTransformer (35) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (34) + :- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14), Statistics(X) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ NoopFilter (10) - : +- ^ Scan parquet (9) - +- ^ FilterExecTransformer (30) - +- ^ ProjectExecTransformer (29) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ NoopFilter (20) - +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ NoopFilter (11) + : +- ^ Scan parquet (10) + +- ^ FilterExecTransformer (33) + +- ^ ProjectExecTransformer (32) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ FlushableHashAggregateExecTransformer (23) + +- ^ NoopFilter (22) + +- ^ Scan parquet (21) +- == Initial Plan == - HashAggregate (56) - +- HashAggregate (55) - +- Project (54) - +- ShuffledHashJoin Inner BuildRight (53) - :- Project (46) - : +- ShuffledHashJoin Inner BuildRight (45) - : :- Exchange (40) - : : +- Filter (39) - : : +- Scan parquet (38) - : +- Exchange (44) - : +- Project (43) - : +- Filter (42) - : +- Scan parquet (41) - +- Filter (52) - +- HashAggregate (51) - +- Exchange (50) - +- HashAggregate (49) - +- Filter (48) - +- Scan parquet (47) + HashAggregate (62) + +- HashAggregate (61) + +- Project (60) + +- SortMergeJoin Inner (59) + :- Project (51) + : +- SortMergeJoin Inner (50) + : :- Sort (44) + : : +- Exchange (43) + : : +- Filter (42) + : : +- Scan parquet (41) + : +- Sort (49) + : +- Exchange (48) + : +- Project (47) + : +- Filter (46) + : +- Scan parquet (45) + +- Sort (58) + +- Filter (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- Filter (53) + +- Scan parquet (52) (1) Scan parquet @@ -75,250 +78,274 @@ Input [3]: 
[l_partkey#X, l_quantity#X, l_extendedprice#X] Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Arguments: [p_partkey#X, p_brand#X, p_container#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [2]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(15) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [1]: [p_partkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [1]: [p_partkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(19) Scan parquet +(21) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(20) NoopFilter +(22) NoopFilter Input [2]: [l_partkey#X, l_quantity#X] Arguments: [l_partkey#X, l_quantity#X] -(21) FlushableHashAggregateExecTransformer +(23) FlushableHashAggregateExecTransformer Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [4]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_partkey#X, sum#X, count#X] Input [3]: [l_partkey#X, sum#X, count#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] +Arguments: X + +(27) ColumnarExchange Input [4]: 
[hash_partition_key#X, l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [3]: [l_partkey#X, sum#X, count#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [3]: [l_partkey#X, sum#X, count#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input [3]: [l_partkey#X, sum#X, count#X] -(28) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [l_partkey#X, avg(l_quantity#X)#X] -(29) ProjectExecTransformer +(32) ProjectExecTransformer Output [2]: [CheckOverflow((0.200000 * promote_precision(avg(l_quantity#X)#X)), DecimalType(18,7)) AS (0.2 * avg(l_quantity))#X, l_partkey#X] Input [2]: [l_partkey#X, avg(l_quantity#X)#X] -(30) FilterExecTransformer +(33) FilterExecTransformer Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Arguments: isnotnull((0.2 * avg(l_quantity))#X) -(31) ShuffledHashJoinExecTransformer +(34) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(32) ProjectExecTransformer +(35) ProjectExecTransformer Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(33) RegularHashAggregateExecTransformer +(36) RegularHashAggregateExecTransformer Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(34) RegularHashAggregateExecTransformer +(37) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [sum(l_extendedprice#X)#X] -(35) ProjectExecTransformer +(38) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6)) AS avg_yearly#X] Input [1]: [sum(l_extendedprice#X)#X] -(36) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [1]: [avg_yearly#X] Arguments: false -(37) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [1]: [avg_yearly#X] -(38) Scan parquet +(41) Scan parquet Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_quantity)] ReadSchema: struct -(39) Filter +(42) Filter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Condition : (isnotnull(l_partkey#X) AND isnotnull(l_quantity#X)) -(40) Exchange +(43) Exchange Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Scan parquet +(44) Sort +Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(45) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(42) Filter +(46) Filter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Condition : ((((isnotnull(p_brand#X) AND 
isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(43) Project +(47) Project Output [1]: [p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(44) Exchange +(48) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(45) ShuffledHashJoin +(49) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(50) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(46) Project +(51) Project Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(47) Scan parquet +(52) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(48) Filter +(53) Filter Input [2]: [l_partkey#X, l_quantity#X] Condition : isnotnull(l_partkey#X) -(49) HashAggregate +(54) HashAggregate Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(50) Exchange +(55) Exchange Input [3]: [l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) HashAggregate +(56) HashAggregate Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [CheckOverflow((0.200000 * promote_precision(avg(l_quantity#X)#X)), DecimalType(18,7)) AS (0.2 * avg(l_quantity))#X, l_partkey#X] -(52) Filter +(57) Filter Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Condition : isnotnull((0.2 * avg(l_quantity))#X) -(53) ShuffledHashJoin +(58) Sort +Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(59) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(54) Project +(60) Project Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(55) HashAggregate +(61) HashAggregate Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(56) HashAggregate +(62) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6)) AS avg_yearly#X] -(57) AdaptiveSparkPlan +(63) AdaptiveSparkPlan Output [1]: [avg_yearly#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/18.txt index 08b902eed5ed4..7fe13a003017b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/18.txt @@ -1,96 +1,102 @@ == Physical Plan == -AdaptiveSparkPlan (97) +AdaptiveSparkPlan (110) +- == Final Plan == - VeloxColumnarToRowExec (64) - +- TakeOrderedAndProjectExecTransformer (63) - +- ^ RegularHashAggregateExecTransformer (61) - +- ^ 
RegularHashAggregateExecTransformer (60) - +- ^ ProjectExecTransformer (59) - +- ^ ShuffledHashJoinExecTransformer Inner (58) - :- ^ InputIteratorTransformer (41) - : +- ^ InputAdapter (40) - : +- ^ ShuffleQueryStage (39), Statistics(X) - : +- ColumnarExchange (38) - : +- ^ ProjectExecTransformer (36) - : +- ^ ShuffledHashJoinExecTransformer Inner (35) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (70) + +- TakeOrderedAndProjectExecTransformer (69) + +- ^ RegularHashAggregateExecTransformer (67) + +- ^ RegularHashAggregateExecTransformer (66) + +- ^ ProjectExecTransformer (65) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (64) + :- ^ InputIteratorTransformer (46) + : +- ShuffleQueryStage (44), Statistics(X) + : +- ColumnarExchange (43) + : +- VeloxAppendBatches (42) + : +- ^ ProjectExecTransformer (40) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (39) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (34) - : +- ^ InputAdapter (33) - : +- ^ ShuffleQueryStage (32), Statistics(X) - : +- ColumnarExchange (31) - : +- ^ ProjectExecTransformer (29) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (28) - : :- ^ InputIteratorTransformer (16) - : : +- ^ InputAdapter (15) - : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : +- ColumnarExchange (13) - : : +- ^ ProjectExecTransformer (11) - : : +- ^ NoopFilter (10) - : : +- ^ Scan parquet (9) - : +- ^ ProjectExecTransformer (27) - : +- ^ FilterExecTransformer (26) - : +- ^ RegularHashAggregateExecTransformer (25) - : +- ^ InputIteratorTransformer (24) - : +- ^ InputAdapter (23) - : +- ^ ShuffleQueryStage (22), Statistics(X) - : +- ColumnarExchange (21) - : +- ^ ProjectExecTransformer (19) - : +- ^ FlushableHashAggregateExecTransformer (18) - : +- ^ Scan parquet (17) - +- ^ ShuffledHashJoinExecTransformer LeftSemi (57) - :- ^ InputIteratorTransformer (49) - : +- ^ InputAdapter (48) - : +- ^ ShuffleQueryStage (47), Statistics(X) - : +- ColumnarExchange (46) - : +- ^ ProjectExecTransformer (44) - : +- ^ NoopFilter (43) - : +- ^ Scan parquet (42) - +- ^ ProjectExecTransformer (56) - +- ^ FilterExecTransformer (55) - +- ^ RegularHashAggregateExecTransformer (54) - +- ^ InputIteratorTransformer (53) - +- ^ InputAdapter (52) - +- ^ ShuffleQueryStage (51), Statistics(X) - +- ReusedExchange (50) + : +- ^ InputIteratorTransformer (38) + : +- ShuffleQueryStage (36), Statistics(X) + : +- ColumnarExchange (35) + : +- VeloxAppendBatches (34) + : +- ^ ProjectExecTransformer (32) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (31) + : :- ^ InputIteratorTransformer (18) + : : +- ShuffleQueryStage (16), Statistics(X) + : : +- ColumnarExchange (15) + : : +- VeloxAppendBatches (14) + : : +- ^ ProjectExecTransformer (12) + : : +- ^ NoopFilter (11) + : : +- ^ Scan parquet (10) + : +- ^ ProjectExecTransformer (30) + : +- ^ FilterExecTransformer (29) + : +- ^ RegularHashAggregateExecTransformer (28) + : +- ^ InputIteratorTransformer (27) + : +- ShuffleQueryStage (25), Statistics(X) + : +- ColumnarExchange (24) + : +- VeloxAppendBatches (23) + : +- ^ ProjectExecTransformer (21) + : +- ^ FlushableHashAggregateExecTransformer (20) + : +- ^ Scan parquet (19) + +- ^ ShuffledHashJoinExecTransformer LeftSemi 
BuildRight (63) + :- ^ InputIteratorTransformer (55) + : +- ShuffleQueryStage (53), Statistics(X) + : +- ColumnarExchange (52) + : +- VeloxAppendBatches (51) + : +- ^ ProjectExecTransformer (49) + : +- ^ NoopFilter (48) + : +- ^ Scan parquet (47) + +- ^ ProjectExecTransformer (62) + +- ^ FilterExecTransformer (61) + +- ^ RegularHashAggregateExecTransformer (60) + +- ^ InputIteratorTransformer (59) + +- ShuffleQueryStage (57), Statistics(X) + +- ReusedExchange (56) +- == Initial Plan == - TakeOrderedAndProject (96) - +- HashAggregate (95) - +- HashAggregate (94) - +- Project (93) - +- ShuffledHashJoin Inner BuildRight (92) - :- Exchange (81) - : +- Project (80) - : +- ShuffledHashJoin Inner BuildLeft (79) - : :- Exchange (67) - : : +- Filter (66) - : : +- Scan parquet (65) - : +- Exchange (78) - : +- ShuffledHashJoin LeftSemi BuildRight (77) - : :- Exchange (70) - : : +- Filter (69) - : : +- Scan parquet (68) - : +- Project (76) - : +- Filter (75) - : +- HashAggregate (74) - : +- Exchange (73) - : +- HashAggregate (72) - : +- Scan parquet (71) - +- ShuffledHashJoin LeftSemi BuildRight (91) - :- Exchange (84) - : +- Filter (83) - : +- Scan parquet (82) - +- Project (90) - +- Filter (89) - +- HashAggregate (88) - +- Exchange (87) - +- HashAggregate (86) - +- Scan parquet (85) + TakeOrderedAndProject (109) + +- HashAggregate (108) + +- HashAggregate (107) + +- Project (106) + +- SortMergeJoin Inner (105) + :- Sort (92) + : +- Exchange (91) + : +- Project (90) + : +- SortMergeJoin Inner (89) + : :- Sort (74) + : : +- Exchange (73) + : : +- Filter (72) + : : +- Scan parquet (71) + : +- Sort (88) + : +- Exchange (87) + : +- SortMergeJoin LeftSemi (86) + : :- Sort (78) + : : +- Exchange (77) + : : +- Filter (76) + : : +- Scan parquet (75) + : +- Sort (85) + : +- Project (84) + : +- Filter (83) + : +- HashAggregate (82) + : +- Exchange (81) + : +- HashAggregate (80) + : +- Scan parquet (79) + +- SortMergeJoin LeftSemi (104) + :- Sort (96) + : +- Exchange (95) + : +- Filter (94) + : +- Scan parquet (93) + +- Sort (103) + +- Project (102) + +- Filter (101) + +- HashAggregate (100) + +- Exchange (99) + +- HashAggregate (98) + +- Scan parquet (97) (1) Scan parquet @@ -112,420 +118,472 @@ Input [2]: [c_custkey#X, c_name#X] Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [c_custkey#X, c_name#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [c_custkey#X, c_name#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [c_custkey#X, c_name#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] 
-(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(18) FlushableHashAggregateExecTransformer +(20) FlushableHashAggregateExecTransformer Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] +Arguments: X + +(24) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(25) RegularHashAggregateExecTransformer +(28) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(26) FilterExecTransformer +(29) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(27) ProjectExecTransformer +(30) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(28) ShuffledHashJoinExecTransformer +(31) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(29) ProjectExecTransformer +(32) ProjectExecTransformer Output [5]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(30) WholeStageCodegenTransformer (X) +(33) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, 
o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(31) ColumnarExchange +(34) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(35) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(32) ShuffleQueryStage +(36) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(33) InputAdapter +(37) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(34) InputIteratorTransformer +(38) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(35) ShuffledHashJoinExecTransformer +(39) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(36) ProjectExecTransformer +(40) ProjectExecTransformer Output [6]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(37) WholeStageCodegenTransformer (X) +(41) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(38) ColumnarExchange +(42) VeloxAppendBatches +Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(43) ColumnarExchange Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(39) ShuffleQueryStage +(44) ShuffleQueryStage Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(40) InputAdapter +(45) InputAdapter Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -(41) InputIteratorTransformer +(46) InputIteratorTransformer Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -(42) Scan parquet +(47) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(43) NoopFilter +(48) NoopFilter Input [2]: [l_orderkey#X, l_quantity#X] Arguments: [l_orderkey#X, l_quantity#X] -(44) ProjectExecTransformer +(49) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_quantity#X] Input [2]: [l_orderkey#X, l_quantity#X] -(45) WholeStageCodegenTransformer (X) +(50) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] Arguments: false -(46) ColumnarExchange +(51) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] +Arguments: X + +(52) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [id=#X] -(47) ShuffleQueryStage +(53) ShuffleQueryStage Output [2]: [l_orderkey#X, l_quantity#X] Arguments: X -(48) InputAdapter +(54) InputAdapter Input [2]: [l_orderkey#X, l_quantity#X] -(49) InputIteratorTransformer +(55) InputIteratorTransformer 
Input [2]: [l_orderkey#X, l_quantity#X] -(50) ReusedExchange [Reuses operator id: 21] +(56) ReusedExchange [Reuses operator id: 24] Output [3]: [l_orderkey#X, sum#X, isEmpty#X] -(51) ShuffleQueryStage +(57) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(52) InputAdapter +(58) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(53) InputIteratorTransformer +(59) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(54) RegularHashAggregateExecTransformer +(60) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(55) FilterExecTransformer +(61) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(56) ProjectExecTransformer +(62) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(57) ShuffledHashJoinExecTransformer +(63) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(58) ShuffledHashJoinExecTransformer +(64) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(59) ProjectExecTransformer +(65) ProjectExecTransformer Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(60) RegularHashAggregateExecTransformer +(66) RegularHashAggregateExecTransformer Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(61) RegularHashAggregateExecTransformer +(67) RegularHashAggregateExecTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(62) WholeStageCodegenTransformer (X) +(68) WholeStageCodegenTransformer (X) Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: false -(63) TakeOrderedAndProjectExecTransformer +(69) TakeOrderedAndProjectExecTransformer Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X], 0 -(64) VeloxColumnarToRowExec +(70) VeloxColumnarToRowExec Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(65) Scan parquet +(71) Scan parquet Output [2]: [c_custkey#X, c_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey)] ReadSchema: struct -(66) Filter +(72) Filter Input [2]: [c_custkey#X, c_name#X] Condition : 
isnotnull(c_custkey#X) -(67) Exchange +(73) Exchange Input [2]: [c_custkey#X, c_name#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) Scan parquet +(74) Sort +Input [2]: [c_custkey#X, c_name#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(75) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(69) Filter +(76) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Condition : (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(70) Exchange +(77) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(78) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(79) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(72) HashAggregate +(80) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(73) Exchange +(81) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) HashAggregate +(82) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(75) Filter +(83) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(76) Project +(84) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(77) ShuffledHashJoin +(85) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(86) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(78) Exchange +(87) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(79) ShuffledHashJoin +(88) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(89) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(80) Project +(90) Project Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(81) Exchange +(91) Exchange Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(82) Scan parquet +(92) Sort +Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(93) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(83) Filter +(94) Filter Input [2]: [l_orderkey#X, l_quantity#X] Condition : isnotnull(l_orderkey#X) -(84) Exchange +(95) Exchange Input [2]: 
[l_orderkey#X, l_quantity#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) Scan parquet +(96) Sort +Input [2]: [l_orderkey#X, l_quantity#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(97) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(86) HashAggregate +(98) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(87) Exchange +(99) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(88) HashAggregate +(100) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(89) Filter +(101) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(90) Project +(102) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(91) ShuffledHashJoin +(103) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(104) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(92) ShuffledHashJoin +(105) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(93) Project +(106) Project Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(94) HashAggregate +(107) HashAggregate Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(95) HashAggregate +(108) HashAggregate Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(96) TakeOrderedAndProject +(109) TakeOrderedAndProject Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(97) AdaptiveSparkPlan +(110) AdaptiveSparkPlan Output [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/19.txt index 29b5d69fa0e88..34abb726b85af 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/19.txt +++ 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/19.txt @@ -1,37 +1,39 @@ == Physical Plan == -AdaptiveSparkPlan (34) +AdaptiveSparkPlan (38) +- == Final Plan == - VeloxColumnarToRowExec (22) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (24) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - HashAggregate (33) - +- HashAggregate (32) - +- Project (31) - +- ShuffledHashJoin Inner BuildRight (30) - :- Exchange (26) - : +- Project (25) - : +- Filter (24) - : +- Scan parquet (23) - +- Exchange (29) - +- Filter (28) - +- Scan parquet (27) + HashAggregate (37) + +- HashAggregate (36) + +- Project (35) + +- SortMergeJoin Inner (34) + :- Sort (29) + : +- Exchange (28) + : +- Project (27) + : +- Filter (26) + : +- Scan parquet (25) + +- Sort (33) + +- Exchange (32) + +- Filter (31) + +- Scan parquet (30) (1) Scan parquet @@ -53,140 +55,156 @@ Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipin Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] 
ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_extendedprice#X, l_discount#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(20) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(21) 
WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(22) VeloxColumnarToRowExec +(24) VeloxColumnarToRowExec Input [1]: [revenue#X] -(23) Scan parquet +(25) Scan parquet Output [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipinstruct), In(l_shipmode, [AIR,AIR REG]), EqualTo(l_shipinstruct,DELIVER IN PERSON), IsNotNull(l_partkey), Or(Or(And(GreaterThanOrEqual(l_quantity,1.00),LessThanOrEqual(l_quantity,11.00)),And(GreaterThanOrEqual(l_quantity,10.00),LessThanOrEqual(l_quantity,20.00))),And(GreaterThanOrEqual(l_quantity,20.00),LessThanOrEqual(l_quantity,30.00)))] ReadSchema: struct -(24) Filter +(26) Filter Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Condition : ((((isnotnull(l_shipinstruct#X) AND l_shipmode#X IN (AIR,AIR REG)) AND (l_shipinstruct#X = DELIVER IN PERSON)) AND isnotnull(l_partkey#X)) AND ((((l_quantity#X >= 1.00) AND (l_quantity#X <= 11.00)) OR ((l_quantity#X >= 10.00) AND (l_quantity#X <= 20.00))) OR ((l_quantity#X >= 20.00) AND (l_quantity#X <= 30.00)))) -(25) Project +(27) Project Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] -(26) Exchange +(28) Exchange Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Scan parquet +(29) Sort +Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(30) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(28) Filter +(31) Filter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Condition : (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(29) Exchange +(32) Exchange Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(30) ShuffledHashJoin +(33) Sort +Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(34) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 
20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(31) Project +(35) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(32) HashAggregate +(36) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(33) HashAggregate +(37) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(34) AdaptiveSparkPlan +(38) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/20.txt index c5dc33a694eef..2cf50b2a3a986 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/20.txt @@ -1,119 +1,129 @@ == Physical Plan == -AdaptiveSparkPlan (123) +AdaptiveSparkPlan (143) +- == Final Plan == - VeloxColumnarToRowExec (83) - +- AQEShuffleRead (82) - +- ShuffleQueryStage (81), Statistics(X) - +- ColumnarExchange (80) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (62) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) - : : +- ^ ProjectExecTransformer (3) - : : +- ^ NoopFilter (2) - : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ ShuffledHashJoinExecTransformer Inner (55) - : :- ^ InputIteratorTransformer (31) - : : +- ^ InputAdapter (30) - : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : +- ColumnarExchange (28) - : : +- ^ ProjectExecTransformer (26) - : : +- ^ ShuffledHashJoinExecTransformer LeftSemi (25) - : : :- ^ InputIteratorTransformer (16) - : : : +- ^ InputAdapter (15) - : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : +- ColumnarExchange (13) - : : : +- ^ 
ProjectExecTransformer (11) - : : : +- ^ NoopFilter (10) - : : : +- ^ Scan parquet (9) - : : +- ^ InputIteratorTransformer (24) - : : +- ^ InputAdapter (23) - : : +- ^ ShuffleQueryStage (22), Statistics(X) - : : +- ColumnarExchange (21) - : : +- ^ ProjectExecTransformer (19) - : : +- ^ NoopFilter (18) - : : +- ^ Scan parquet (17) - : +- ^ InputIteratorTransformer (54) - : +- ^ InputAdapter (53) - : +- ^ ShuffleQueryStage (52), Statistics(X) - : +- ColumnarExchange (51) - : +- ^ ProjectExecTransformer (49) - : +- ^ FilterExecTransformer (48) - : +- ^ ProjectExecTransformer (47) - : +- ^ RegularHashAggregateExecTransformer (46) - : +- ^ RegularHashAggregateExecTransformer (45) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (44) - : :- ^ InputIteratorTransformer (39) - : : +- ^ InputAdapter (38) - : : +- ^ ShuffleQueryStage (37), Statistics(X) - : : +- ColumnarExchange (36) - : : +- ^ ProjectExecTransformer (34) - : : +- ^ NoopFilter (33) - : : +- ^ Scan parquet (32) - : +- ^ InputIteratorTransformer (43) - : +- ^ InputAdapter (42) - : +- ^ ShuffleQueryStage (41), Statistics(X) - : +- ReusedExchange (40) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74), Statistics(X) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ NoopFilter (70) - +- ^ Scan parquet (69) + VeloxColumnarToRowExec (93) + +- AQEShuffleRead (92) + +- ShuffleQueryStage (91), Statistics(X) + +- ColumnarExchange (90) + +- VeloxAppendBatches (89) + +- ^ ProjectExecTransformer (87) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (86) + :- ^ InputIteratorTransformer (76) + : +- ShuffleQueryStage (74), Statistics(X) + : +- ColumnarExchange (73) + : +- VeloxAppendBatches (72) + : +- ^ ProjectExecTransformer (70) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (69) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) + : : +- ^ ProjectExecTransformer (3) + : : +- ^ NoopFilter (2) + : : +- ^ Scan parquet (1) + : +- ^ InputIteratorTransformer (68) + : +- ShuffleQueryStage (66), Statistics(X) + : +- ColumnarExchange (65) + : +- VeloxAppendBatches (64) + : +- ^ ProjectExecTransformer (62) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (61) + : :- ^ InputIteratorTransformer (35) + : : +- ShuffleQueryStage (33), Statistics(X) + : : +- ColumnarExchange (32) + : : +- VeloxAppendBatches (31) + : : +- ^ ProjectExecTransformer (29) + : : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (28) + : : :- ^ InputIteratorTransformer (18) + : : : +- ShuffleQueryStage (16), Statistics(X) + : : : +- ColumnarExchange (15) + : : : +- VeloxAppendBatches (14) + : : : +- ^ ProjectExecTransformer (12) + : : : +- ^ NoopFilter (11) + : : : +- ^ Scan parquet (10) + : : +- ^ InputIteratorTransformer (27) + : : +- ShuffleQueryStage (25), Statistics(X) + : : +- ColumnarExchange (24) + : : +- VeloxAppendBatches (23) + : : +- ^ ProjectExecTransformer (21) + : : +- ^ NoopFilter (20) + : : +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (60) + : +- ShuffleQueryStage (58), Statistics(X) + : +- ColumnarExchange (57) + : +- VeloxAppendBatches (56) + : +- ^ ProjectExecTransformer (54) + : +- ^ FilterExecTransformer (53) + : +- ^ ProjectExecTransformer (52) + : +- ^ RegularHashAggregateExecTransformer (51) + : +- ^ RegularHashAggregateExecTransformer (50) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (49) + : :- ^ InputIteratorTransformer (44) + : : +- 
ShuffleQueryStage (42), Statistics(X) + : : +- ColumnarExchange (41) + : : +- VeloxAppendBatches (40) + : : +- ^ ProjectExecTransformer (38) + : : +- ^ NoopFilter (37) + : : +- ^ Scan parquet (36) + : +- ^ InputIteratorTransformer (48) + : +- ShuffleQueryStage (46), Statistics(X) + : +- ReusedExchange (45) + +- ^ InputIteratorTransformer (85) + +- ShuffleQueryStage (83), Statistics(X) + +- ColumnarExchange (82) + +- VeloxAppendBatches (81) + +- ^ ProjectExecTransformer (79) + +- ^ NoopFilter (78) + +- ^ Scan parquet (77) +- == Initial Plan == - Sort (122) - +- Exchange (121) - +- Project (120) - +- ShuffledHashJoin Inner BuildRight (119) - :- Exchange (114) - : +- Project (113) - : +- ShuffledHashJoin LeftSemi BuildRight (112) - : :- Exchange (86) - : : +- Filter (85) - : : +- Scan parquet (84) - : +- Exchange (111) - : +- Project (110) - : +- ShuffledHashJoin Inner BuildLeft (109) - : :- Exchange (95) - : : +- ShuffledHashJoin LeftSemi BuildRight (94) - : : :- Exchange (89) - : : : +- Filter (88) - : : : +- Scan parquet (87) - : : +- Exchange (93) - : : +- Project (92) - : : +- Filter (91) - : : +- Scan parquet (90) - : +- Exchange (108) - : +- Filter (107) - : +- HashAggregate (106) - : +- HashAggregate (105) - : +- ShuffledHashJoin LeftSemi BuildRight (104) - : :- Exchange (99) - : : +- Project (98) - : : +- Filter (97) - : : +- Scan parquet (96) - : +- Exchange (103) - : +- Project (102) - : +- Filter (101) - : +- Scan parquet (100) - +- Exchange (118) - +- Project (117) - +- Filter (116) - +- Scan parquet (115) + Sort (142) + +- Exchange (141) + +- Project (140) + +- SortMergeJoin Inner (139) + :- Sort (133) + : +- Exchange (132) + : +- Project (131) + : +- SortMergeJoin LeftSemi (130) + : :- Sort (97) + : : +- Exchange (96) + : : +- Filter (95) + : : +- Scan parquet (94) + : +- Sort (129) + : +- Exchange (128) + : +- Project (127) + : +- SortMergeJoin Inner (126) + : :- Sort (109) + : : +- Exchange (108) + : : +- SortMergeJoin LeftSemi (107) + : : :- Sort (101) + : : : +- Exchange (100) + : : : +- Filter (99) + : : : +- Scan parquet (98) + : : +- Sort (106) + : : +- Exchange (105) + : : +- Project (104) + : : +- Filter (103) + : : +- Scan parquet (102) + : +- Sort (125) + : +- Exchange (124) + : +- Filter (123) + : +- HashAggregate (122) + : +- HashAggregate (121) + : +- SortMergeJoin LeftSemi (120) + : :- Sort (114) + : : +- Exchange (113) + : : +- Project (112) + : : +- Filter (111) + : : +- Scan parquet (110) + : +- Sort (119) + : +- Exchange (118) + : +- Project (117) + : +- Filter (116) + : +- Scan parquet (115) + +- Sort (138) + +- Exchange (137) + +- Project (136) + +- Filter (135) + +- Scan parquet (134) (1) Scan parquet @@ -135,508 +145,588 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input 
[4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(18) NoopFilter +(20) NoopFilter Input [2]: [p_partkey#X, p_name#X] Arguments: [p_partkey#X, p_name#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [2]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(24) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [1]: [p_partkey#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [1]: [p_partkey#X] -(25) ShuffledHashJoinExecTransformer +(28) ShuffledHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, ps_suppkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), 
ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(32) Scan parquet +(36) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(33) NoopFilter +(37) NoopFilter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Arguments: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(34) ProjectExecTransformer +(38) ProjectExecTransformer Output [4]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] +Arguments: X + +(41) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] -(40) ReusedExchange [Reuses operator id: 21] +(45) ReusedExchange [Reuses operator id: 24] Output [1]: [p_partkey#X] -(41) ShuffleQueryStage +(46) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(42) InputAdapter +(47) InputAdapter Input [1]: [p_partkey#X] -(43) InputIteratorTransformer +(48) InputIteratorTransformer Input [1]: [p_partkey#X] -(44) ShuffledHashJoinExecTransformer +(49) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(45) RegularHashAggregateExecTransformer +(50) RegularHashAggregateExecTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(46) RegularHashAggregateExecTransformer +(51) RegularHashAggregateExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(47) ProjectExecTransformer +(52) ProjectExecTransformer Output [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3)) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(48) FilterExecTransformer +(53) FilterExecTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: isnotnull((0.5 * 
sum(l_quantity))#X) -(49) ProjectExecTransformer +(54) ProjectExecTransformer Output [4]: [hash(l_partkey#X, l_suppkey#X, 42) AS hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(50) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: false -(51) ColumnarExchange +(56) VeloxAppendBatches +Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] +Arguments: X + +(57) ColumnarExchange Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(52) ShuffleQueryStage +(58) ShuffleQueryStage Output [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: X -(53) InputAdapter +(59) InputAdapter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(54) InputIteratorTransformer +(60) InputIteratorTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(55) ShuffledHashJoinExecTransformer +(61) ShuffledHashJoinExecTransformer Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(56) ProjectExecTransformer +(62) ProjectExecTransformer Output [2]: [hash(ps_suppkey#X, 42) AS hash_partition_key#X, ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(57) WholeStageCodegenTransformer (X) +(63) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, ps_suppkey#X] Arguments: false -(58) ColumnarExchange +(64) VeloxAppendBatches +Input [2]: [hash_partition_key#X, ps_suppkey#X] +Arguments: X + +(65) ColumnarExchange Input [2]: [hash_partition_key#X, ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(66) ShuffleQueryStage Output [1]: [ps_suppkey#X] Arguments: X -(60) InputAdapter +(67) InputAdapter Input [1]: [ps_suppkey#X] -(61) InputIteratorTransformer +(68) InputIteratorTransformer Input [1]: [ps_suppkey#X] -(62) ShuffledHashJoinExecTransformer +(69) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(63) ProjectExecTransformer +(70) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(64) WholeStageCodegenTransformer (X) +(71) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(65) ColumnarExchange +(72) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(73) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(74) ShuffleQueryStage Output [3]: [s_name#X, s_address#X, s_nationkey#X] Arguments: X -(67) InputAdapter +(75) InputAdapter Input [3]: [s_name#X, s_address#X, s_nationkey#X] -(68) InputIteratorTransformer +(76) InputIteratorTransformer 
Input [3]: [s_name#X, s_address#X, s_nationkey#X] -(69) Scan parquet +(77) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(70) NoopFilter +(78) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(71) ProjectExecTransformer +(79) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(72) WholeStageCodegenTransformer (X) +(80) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(73) ColumnarExchange +(81) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(82) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(83) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(75) InputAdapter +(84) InputAdapter Input [1]: [n_nationkey#X] -(76) InputIteratorTransformer +(85) InputIteratorTransformer Input [1]: [n_nationkey#X] -(77) ShuffledHashJoinExecTransformer +(86) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(78) ProjectExecTransformer +(87) ProjectExecTransformer Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(79) WholeStageCodegenTransformer (X) +(88) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(80) ColumnarExchange +(89) VeloxAppendBatches +Input [2]: [s_name#X, s_address#X] +Arguments: X + +(90) ColumnarExchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(81) ShuffleQueryStage +(91) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] Arguments: X -(82) AQEShuffleRead +(92) AQEShuffleRead Input [2]: [s_name#X, s_address#X] Arguments: local -(83) VeloxColumnarToRowExec +(93) VeloxColumnarToRowExec Input [2]: [s_name#X, s_address#X] -(84) Scan parquet +(94) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_nationkey)] ReadSchema: struct -(85) Filter +(95) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Condition : isnotnull(s_nationkey#X) -(86) Exchange +(96) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(87) Scan parquet +(97) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(98) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(88) Filter +(99) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Condition : ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(89) Exchange +(100) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) Scan parquet +(101) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: 
[ps_partkey#X ASC NULLS FIRST], false, 0 + +(102) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(91) Filter +(103) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(92) Project +(104) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(93) Exchange +(105) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(94) ShuffledHashJoin +(106) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(107) SortMergeJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(95) Exchange +(108) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(96) Scan parquet +(109) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: [ps_partkey#X ASC NULLS FIRST, ps_suppkey#X ASC NULLS FIRST], false, 0 + +(110) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(97) Filter +(111) Filter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(98) Project +(112) Project Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(99) Exchange +(113) Exchange Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(100) Scan parquet +(114) Sort +Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(115) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(101) Filter +(116) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(102) Project +(117) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(103) Exchange +(118) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(104) ShuffledHashJoin +(119) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(120) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(105) HashAggregate +(121) HashAggregate Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(106) HashAggregate +(122) HashAggregate Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3)) AS (0.5 * sum(l_quantity))#X, 
l_partkey#X, l_suppkey#X] -(107) Filter +(123) Filter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Condition : isnotnull((0.5 * sum(l_quantity))#X) -(108) Exchange +(124) Exchange Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(125) Sort +Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] +Arguments: [l_partkey#X ASC NULLS FIRST, l_suppkey#X ASC NULLS FIRST], false, 0 + +(126) SortMergeJoin Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(110) Project +(127) Project Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(111) Exchange +(128) Exchange Input [1]: [ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) ShuffledHashJoin +(129) Sort +Input [1]: [ps_suppkey#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(130) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(113) Project +(131) Project Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(114) Exchange +(132) Exchange Input [3]: [s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(115) Scan parquet +(133) Sort +Input [3]: [s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(134) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(116) Filter +(135) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(117) Project +(136) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(118) Exchange +(137) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(119) ShuffledHashJoin +(138) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(139) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(120) Project +(140) Project Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(121) Exchange +(141) Exchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(122) Sort +(142) Sort Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(123) AdaptiveSparkPlan +(143) AdaptiveSparkPlan Output [2]: [s_name#X, s_address#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/21.txt index 91bb400aac043..f92684b17b15c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/21.txt @@ -1,113 +1,122 @@ == Physical Plan == -AdaptiveSparkPlan (118) +AdaptiveSparkPlan (137) +- == Final Plan == - 
VeloxColumnarToRowExec (81) - +- ^ RegularHashAggregateExecTransformer (79) - +- ^ InputIteratorTransformer (78) - +- ^ InputAdapter (77) - +- ^ ShuffleQueryStage (76), Statistics(X) - +- ColumnarExchange (75) - +- ^ ProjectExecTransformer (73) - +- ^ FlushableHashAggregateExecTransformer (72) - +- ^ ProjectExecTransformer (71) - +- ^ ShuffledHashJoinExecTransformer Inner (70) - :- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ ShuffledHashJoinExecTransformer Inner (55) - : :- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ ShuffledHashJoinExecTransformer Inner (40) - : : :- ^ InputIteratorTransformer (8) - : : : +- ^ InputAdapter (7) - : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (91) + +- ^ RegularHashAggregateExecTransformer (89) + +- ^ InputIteratorTransformer (88) + +- ShuffleQueryStage (86), Statistics(X) + +- ColumnarExchange (85) + +- VeloxAppendBatches (84) + +- ^ ProjectExecTransformer (82) + +- ^ FlushableHashAggregateExecTransformer (81) + +- ^ ProjectExecTransformer (80) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (79) + :- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (62) + : :- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (45) + : : :- ^ InputIteratorTransformer (9) + : : : +- ShuffleQueryStage (7), Statistics(X) + : : : +- ColumnarExchange (6) + : : : +- VeloxAppendBatches (5) : : : +- ^ ProjectExecTransformer (3) : : : +- ^ NoopFilter (2) : : : +- ^ Scan parquet (1) - : : +- ^ InputIteratorTransformer (39) - : : +- ^ InputAdapter (38) - : : +- ^ ShuffleQueryStage (37), Statistics(X) - : : +- ColumnarExchange (36) - : : +- ^ ProjectExecTransformer (34) - : : +- ^ ShuffledHashJoinExecTransformer LeftAnti (33) - : : :- ^ ShuffledHashJoinExecTransformer LeftSemi (24) - : : : :- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ NoopFilter (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (23) - : : : +- ^ InputAdapter (22) - : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : +- ColumnarExchange (20) - : : : +- ^ ProjectExecTransformer (18) - : : : +- ^ Scan parquet (17) - : : +- ^ InputIteratorTransformer (32) - : : +- ^ InputAdapter (31) - : : +- ^ ShuffleQueryStage (30), Statistics(X) - : : +- ColumnarExchange (29) - : : +- ^ ProjectExecTransformer (27) - : : +- ^ NoopFilter (26) - : : +- ^ Scan parquet (25) - : +- ^ InputIteratorTransformer (54) - : +- ^ InputAdapter (53) - : +- ^ ShuffleQueryStage (52), Statistics(X) - : +- ColumnarExchange (51) - : +- ^ ProjectExecTransformer (49) - : +- ^ NoopFilter (48) - : +- ^ Scan parquet (47) - +- ^ InputIteratorTransformer (69) - +- ^ InputAdapter (68) - +- ^ ShuffleQueryStage (67), Statistics(X) - +- ColumnarExchange (66) - 
+- ^ ProjectExecTransformer (64) - +- ^ NoopFilter (63) - +- ^ Scan parquet (62) + : : +- ^ InputIteratorTransformer (44) + : : +- ShuffleQueryStage (42), Statistics(X) + : : +- ColumnarExchange (41) + : : +- VeloxAppendBatches (40) + : : +- ^ ProjectExecTransformer (38) + : : +- ^ ShuffledHashJoinExecTransformer LeftAnti BuildRight (37) + : : :- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (27) + : : : :- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ NoopFilter (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (26) + : : : +- ShuffleQueryStage (24), Statistics(X) + : : : +- ColumnarExchange (23) + : : : +- VeloxAppendBatches (22) + : : : +- ^ ProjectExecTransformer (20) + : : : +- ^ Scan parquet (19) + : : +- ^ InputIteratorTransformer (36) + : : +- ShuffleQueryStage (34), Statistics(X) + : : +- ColumnarExchange (33) + : : +- VeloxAppendBatches (32) + : : +- ^ ProjectExecTransformer (30) + : : +- ^ NoopFilter (29) + : : +- ^ Scan parquet (28) + : +- ^ InputIteratorTransformer (61) + : +- ShuffleQueryStage (59), Statistics(X) + : +- ColumnarExchange (58) + : +- VeloxAppendBatches (57) + : +- ^ ProjectExecTransformer (55) + : +- ^ NoopFilter (54) + : +- ^ Scan parquet (53) + +- ^ InputIteratorTransformer (78) + +- ShuffleQueryStage (76), Statistics(X) + +- ColumnarExchange (75) + +- VeloxAppendBatches (74) + +- ^ ProjectExecTransformer (72) + +- ^ NoopFilter (71) + +- ^ Scan parquet (70) +- == Initial Plan == - TakeOrderedAndProject (117) - +- HashAggregate (116) - +- Exchange (115) - +- HashAggregate (114) - +- Project (113) - +- ShuffledHashJoin Inner BuildRight (112) - :- Exchange (107) - : +- Project (106) - : +- ShuffledHashJoin Inner BuildRight (105) - : :- Exchange (100) - : : +- Project (99) - : : +- ShuffledHashJoin Inner BuildLeft (98) - : : :- Exchange (84) - : : : +- Filter (83) - : : : +- Scan parquet (82) - : : +- Exchange (97) - : : +- ShuffledHashJoin LeftAnti BuildRight (96) - : : :- ShuffledHashJoin LeftSemi BuildRight (91) - : : : :- Exchange (88) - : : : : +- Project (87) - : : : : +- Filter (86) - : : : : +- Scan parquet (85) - : : : +- Exchange (90) - : : : +- Scan parquet (89) - : : +- Exchange (95) - : : +- Project (94) - : : +- Filter (93) - : : +- Scan parquet (92) - : +- Exchange (104) - : +- Project (103) - : +- Filter (102) - : +- Scan parquet (101) - +- Exchange (111) - +- Project (110) - +- Filter (109) - +- Scan parquet (108) + TakeOrderedAndProject (136) + +- HashAggregate (135) + +- Exchange (134) + +- HashAggregate (133) + +- Project (132) + +- SortMergeJoin Inner (131) + :- Sort (125) + : +- Exchange (124) + : +- Project (123) + : +- SortMergeJoin Inner (122) + : :- Sort (116) + : : +- Exchange (115) + : : +- Project (114) + : : +- SortMergeJoin Inner (113) + : : :- Sort (95) + : : : +- Exchange (94) + : : : +- Filter (93) + : : : +- Scan parquet (92) + : : +- Sort (112) + : : +- Exchange (111) + : : +- SortMergeJoin LeftAnti (110) + : : :- SortMergeJoin LeftSemi (104) + : : : :- Sort (100) + : : : : +- Exchange (99) + : : : : +- Project (98) + : : : : +- Filter (97) + : : : : +- Scan parquet (96) + : : : +- Sort (103) + : : : +- Exchange (102) + : : : +- Scan parquet (101) + : : +- Sort (109) + : : +- Exchange (108) + : : +- Project (107) + : : +- Filter (106) + : : +- Scan parquet (105) + : +- Sort (121) + : +- Exchange (120) + : +- Project (119) + : +- 
Filter (118) + : +- Scan parquet (117) + +- Sort (130) + +- Exchange (129) + +- Project (128) + +- Filter (127) + +- Scan parquet (126) (1) Scan parquet @@ -129,490 +138,566 @@ Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Arguments: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [2]: [l_orderkey#X, l_suppkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(23) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(24) ShuffledHashJoinExecTransformer 
+(27) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(25) Scan parquet +(28) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(26) NoopFilter +(29) NoopFilter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Arguments: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(27) ProjectExecTransformer +(30) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(28) WholeStageCodegenTransformer (X) +(31) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(29) ColumnarExchange +(32) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(33) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(30) ShuffleQueryStage +(34) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(31) InputAdapter +(35) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(32) InputIteratorTransformer +(36) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(33) ShuffledHashJoinExecTransformer +(37) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(34) ProjectExecTransformer +(38) ProjectExecTransformer Output [3]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [2]: [l_orderkey#X, l_suppkey#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(41) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(40) ShuffledHashJoinExecTransformer +(45) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, 
s_nationkey#X, l_orderkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] -(47) Scan parquet +(53) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(48) NoopFilter +(54) NoopFilter Input [2]: [o_orderkey#X, o_orderstatus#X] Arguments: [o_orderkey#X, o_orderstatus#X] -(49) ProjectExecTransformer +(55) ProjectExecTransformer Output [2]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(50) WholeStageCodegenTransformer (X) +(56) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, o_orderkey#X] Arguments: false -(51) ColumnarExchange +(57) VeloxAppendBatches +Input [2]: [hash_partition_key#X, o_orderkey#X] +Arguments: X + +(58) ColumnarExchange Input [2]: [hash_partition_key#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [id=#X] -(52) ShuffleQueryStage +(59) ShuffleQueryStage Output [1]: [o_orderkey#X] Arguments: X -(53) InputAdapter +(60) InputAdapter Input [1]: [o_orderkey#X] -(54) InputIteratorTransformer +(61) InputIteratorTransformer Input [1]: [o_orderkey#X] -(55) ShuffledHashJoinExecTransformer +(62) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [s_name#X, s_nationkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [s_name#X, s_nationkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [s_name#X, s_nationkey#X] -(62) Scan parquet +(70) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(63) NoopFilter +(71) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(64) ProjectExecTransformer +(72) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(65) WholeStageCodegenTransformer (X) +(73) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(66) ColumnarExchange +(74) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(75) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: 
hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(67) ShuffleQueryStage +(76) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(68) InputAdapter +(77) InputAdapter Input [1]: [n_nationkey#X] -(69) InputIteratorTransformer +(78) InputIteratorTransformer Input [1]: [n_nationkey#X] -(70) ShuffledHashJoinExecTransformer +(79) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(72) FlushableHashAggregateExecTransformer +(81) FlushableHashAggregateExecTransformer Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(73) ProjectExecTransformer +(82) ProjectExecTransformer Output [3]: [hash(s_name#X, 42) AS hash_partition_key#X, s_name#X, count#X] Input [2]: [s_name#X, count#X] -(74) WholeStageCodegenTransformer (X) +(83) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: false -(75) ColumnarExchange +(84) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, count#X] +Arguments: X + +(85) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] -(76) ShuffleQueryStage +(86) ShuffleQueryStage Output [2]: [s_name#X, count#X] Arguments: X -(77) InputAdapter +(87) InputAdapter Input [2]: [s_name#X, count#X] -(78) InputIteratorTransformer +(88) InputIteratorTransformer Input [2]: [s_name#X, count#X] -(79) RegularHashAggregateExecTransformer +(89) RegularHashAggregateExecTransformer Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(80) WholeStageCodegenTransformer (X) +(90) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, numwait#X] Arguments: false -(81) VeloxColumnarToRowExec +(91) VeloxColumnarToRowExec Input [2]: [s_name#X, numwait#X] -(82) Scan parquet +(92) Scan parquet Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(83) Filter +(93) Filter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(84) Exchange +(94) Exchange Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) Scan parquet +(95) Sort +Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(96) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(86) Filter +(97) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(87) Project +(98) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(88) Exchange +(99) 
Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(89) Scan parquet +(100) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(101) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(90) Exchange +(102) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(91) ShuffledHashJoin +(103) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(104) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(92) Scan parquet +(105) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(93) Filter +(106) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(94) Project +(107) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(95) Exchange +(108) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(96) ShuffledHashJoin +(109) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(110) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(97) Exchange +(111) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(98) ShuffledHashJoin +(112) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(113) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(99) Project +(114) Project Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(100) Exchange +(115) Exchange Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(101) Scan parquet +(116) Sort +Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(117) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(102) Filter +(118) Filter Input [2]: [o_orderkey#X, o_orderstatus#X] Condition : ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(103) Project +(119) Project Output [1]: [o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(104) Exchange +(120) Exchange Input [1]: [o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) ShuffledHashJoin +(121) Sort +Input [1]: [o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(122) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(106) Project +(123) Project Output [2]: [s_name#X, s_nationkey#X] Input [4]: 
[s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(107) Exchange +(124) Exchange Input [2]: [s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) Scan parquet +(125) Sort +Input [2]: [s_name#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(126) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(109) Filter +(127) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(110) Project +(128) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(111) Exchange +(129) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) ShuffledHashJoin +(130) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(131) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(113) Project +(132) Project Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(114) HashAggregate +(133) HashAggregate Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(115) Exchange +(134) Exchange Input [2]: [s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(116) HashAggregate +(135) HashAggregate Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(117) TakeOrderedAndProject +(136) TakeOrderedAndProject Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X] -(118) AdaptiveSparkPlan +(137) AdaptiveSparkPlan Output [2]: [s_name#X, numwait#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/22.txt index ba18df0d01190..1c2790a4a9999 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/22.txt @@ -1,47 +1,49 @@ == Physical Plan == -AdaptiveSparkPlan (46) +AdaptiveSparkPlan (52) +- == Final Plan == - VeloxColumnarToRowExec (33) - +- ^ SortExecTransformer (31) - +- ^ InputIteratorTransformer (30) - +- ^ InputAdapter (29) - +- ^ ShuffleQueryStage (28), Statistics(X) - +- ColumnarExchange (27) - +- ^ RegularHashAggregateExecTransformer (25) - +- ^ InputIteratorTransformer (24) - +- ^ InputAdapter (23) - +- ^ ShuffleQueryStage (22), Statistics(X) - +- ColumnarExchange (21) - +- ^ ProjectExecTransformer (19) - +- ^ FlushableHashAggregateExecTransformer (18) - +- ^ ProjectExecTransformer (17) - +- ^ ShuffledHashJoinExecTransformer LeftAnti (16) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (37) + +- ^ SortExecTransformer (35) + +- ^ InputIteratorTransformer (34) + +- ShuffleQueryStage (32), Statistics(X) + +- ColumnarExchange (31) + +- VeloxAppendBatches (30) + +- ^ RegularHashAggregateExecTransformer (28) + +- ^ 
InputIteratorTransformer (27) + +- ShuffleQueryStage (25), Statistics(X) + +- ColumnarExchange (24) + +- VeloxAppendBatches (23) + +- ^ ProjectExecTransformer (21) + +- ^ FlushableHashAggregateExecTransformer (20) + +- ^ ProjectExecTransformer (19) + +- ^ ShuffledHashJoinExecTransformer LeftAnti BuildRight (18) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (15) - +- ^ InputAdapter (14) - +- ^ ShuffleQueryStage (13), Statistics(X) - +- ColumnarExchange (12) - +- ^ ProjectExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (17) + +- ShuffleQueryStage (15), Statistics(X) + +- ColumnarExchange (14) + +- VeloxAppendBatches (13) + +- ^ ProjectExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (45) - +- Exchange (44) - +- HashAggregate (43) - +- Exchange (42) - +- HashAggregate (41) - +- Project (40) - +- ShuffledHashJoin LeftAnti BuildRight (39) - :- Exchange (36) - : +- Filter (35) - : +- Scan parquet (34) - +- Exchange (38) - +- Scan parquet (37) + Sort (51) + +- Exchange (50) + +- HashAggregate (49) + +- Exchange (48) + +- HashAggregate (47) + +- Project (46) + +- SortMergeJoin LeftAnti (45) + :- Sort (41) + : +- Exchange (40) + : +- Filter (39) + : +- Scan parquet (38) + +- Sort (44) + +- Exchange (43) + +- Scan parquet (42) (1) Scan parquet @@ -63,297 +65,325 @@ Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(9) Scan parquet +(10) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(10) ProjectExecTransformer +(11) ProjectExecTransformer Output [2]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_custkey#X] Input [1]: [o_custkey#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, o_custkey#X] Arguments: false -(12) ColumnarExchange +(13) VeloxAppendBatches +Input [2]: [hash_partition_key#X, o_custkey#X] +Arguments: X + +(14) ColumnarExchange Input [2]: [hash_partition_key#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [id=#X] -(13) ShuffleQueryStage +(15) ShuffleQueryStage Output [1]: [o_custkey#X] Arguments: X -(14) InputAdapter +(16) InputAdapter Input [1]: [o_custkey#X] -(15) InputIteratorTransformer +(17) InputIteratorTransformer Input [1]: [o_custkey#X] -(16) ShuffledHashJoinExecTransformer +(18) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(17) ProjectExecTransformer +(19) ProjectExecTransformer Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, 
c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(18) FlushableHashAggregateExecTransformer +(20) FlushableHashAggregateExecTransformer Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [5]: [hash(cntrycode#X, 42) AS hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] +Arguments: X + +(24) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(25) RegularHashAggregateExecTransformer +(28) RegularHashAggregateExecTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(26) WholeStageCodegenTransformer (X) +(29) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(27) ColumnarExchange +(30) VeloxAppendBatches +Input [3]: [cntrycode#X, numcust#X, totacctbal#X] +Arguments: X + +(31) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(28) ShuffleQueryStage +(32) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: X -(29) InputAdapter +(33) InputAdapter Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(30) InputIteratorTransformer +(34) InputIteratorTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(31) SortExecTransformer +(35) SortExecTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(32) WholeStageCodegenTransformer (X) +(36) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(33) VeloxColumnarToRowExec +(37) VeloxColumnarToRowExec Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(34) Scan parquet +(38) Scan parquet Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal)] ReadSchema: struct -(35) Filter +(39) Filter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) AND (cast(c_acctbal#X as decimal(16,6)) > Subquery subquery#X, [id=#X])) -(36) Exchange +(40) Exchange Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(37) Scan 
parquet +(41) Sort +Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(42) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Exchange +(43) Exchange Input [1]: [o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) ShuffledHashJoin +(44) Sort +Input [1]: [o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(45) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(40) Project +(46) Project Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(41) HashAggregate +(47) HashAggregate Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(42) Exchange +(48) Exchange Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) HashAggregate +(49) HashAggregate Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(44) Exchange +(50) Exchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(45) Sort +(51) Sort Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(46) AdaptiveSparkPlan +(52) AdaptiveSparkPlan Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: isFinalPlan=true ===== Subqueries ===== Subquery:1 Hosting operator id = 1 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (65) +AdaptiveSparkPlan (72) +- == Final Plan == - VeloxColumnarToRowExec (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) - +- ^ FlushableHashAggregateExecTransformer (50) - +- ^ ProjectExecTransformer (49) - +- ^ NoopFilter (48) - +- ^ Scan parquet (47) + VeloxColumnarToRowExec (65) + +- ^ RegularHashAggregateExecTransformer (63) + +- ^ InputIteratorTransformer (62) + +- ShuffleQueryStage (60), Statistics(X) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ FlushableHashAggregateExecTransformer (56) + +- ^ ProjectExecTransformer (55) + +- ^ NoopFilter (54) + +- ^ Scan parquet (53) +- == Initial Plan == - HashAggregate (64) - +- Exchange (63) - +- HashAggregate (62) - +- Project (61) - +- Filter (60) - +- Scan parquet (59) + HashAggregate (71) + +- Exchange (70) + +- HashAggregate (69) + +- Project (68) + +- Filter (67) + +- Scan parquet (66) -(47) Scan parquet +(53) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(48) NoopFilter +(54) NoopFilter Input [2]: [c_phone#X, c_acctbal#X] Arguments: [c_phone#X, c_acctbal#X] -(49) ProjectExecTransformer +(55) ProjectExecTransformer Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(50) FlushableHashAggregateExecTransformer +(56) FlushableHashAggregateExecTransformer Input [1]: 
[c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(51) WholeStageCodegenTransformer (X) +(57) WholeStageCodegenTransformer (X) Input [2]: [sum#X, count#X] Arguments: false -(52) ColumnarExchange +(58) VeloxAppendBatches +Input [2]: [sum#X, count#X] +Arguments: X + +(59) ColumnarExchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(60) ShuffleQueryStage Output [2]: [sum#X, count#X] Arguments: X -(54) InputAdapter +(61) InputAdapter Input [2]: [sum#X, count#X] -(55) InputIteratorTransformer +(62) InputIteratorTransformer Input [2]: [sum#X, count#X] -(56) RegularHashAggregateExecTransformer +(63) RegularHashAggregateExecTransformer Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [1]: [avg(c_acctbal)#X] Arguments: false -(58) VeloxColumnarToRowExec +(65) VeloxColumnarToRowExec Input [1]: [avg(c_acctbal)#X] -(59) Scan parquet +(66) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(60) Filter +(67) Filter Input [2]: [c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND (c_acctbal#X > 0.00)) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) -(61) Project +(68) Project Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(62) HashAggregate +(69) HashAggregate Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(63) Exchange +(70) Exchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(64) HashAggregate +(71) HashAggregate Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(65) AdaptiveSparkPlan +(72) AdaptiveSparkPlan Output [1]: [avg(c_acctbal)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/3.txt index 08588f64d24f0..1f99052941446 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/3.txt @@ -1,60 +1,64 @@ == Physical Plan == -AdaptiveSparkPlan (59) +AdaptiveSparkPlan (67) +- == Final Plan == - VeloxColumnarToRowExec (39) - +- TakeOrderedAndProjectExecTransformer (38) - +- ^ ProjectExecTransformer (36) - +- ^ RegularHashAggregateExecTransformer (35) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ ProjectExecTransformer (33) - +- ^ ShuffledHashJoinExecTransformer Inner (32) - :- ^ InputIteratorTransformer (23) - : +- ^ InputAdapter (22) - : +- ^ ShuffleQueryStage (21), Statistics(X) - : +- ColumnarExchange (20) - : +- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (43) + +- TakeOrderedAndProjectExecTransformer (42) + +- ^ 
ProjectExecTransformer (40) + +- ^ RegularHashAggregateExecTransformer (39) + +- ^ RegularHashAggregateExecTransformer (38) + +- ^ ProjectExecTransformer (37) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + :- ^ InputIteratorTransformer (26) + : +- ShuffleQueryStage (24), Statistics(X) + : +- ColumnarExchange (23) + : +- VeloxAppendBatches (22) + : +- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14), Statistics(X) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ NoopFilter (10) - : +- ^ Scan parquet (9) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ ProjectExecTransformer (26) - +- ^ NoopFilter (25) - +- ^ Scan parquet (24) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ NoopFilter (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ ProjectExecTransformer (29) + +- ^ NoopFilter (28) + +- ^ Scan parquet (27) +- == Initial Plan == - TakeOrderedAndProject (58) - +- HashAggregate (57) - +- HashAggregate (56) - +- Project (55) - +- ShuffledHashJoin Inner BuildRight (54) - :- Exchange (49) - : +- Project (48) - : +- ShuffledHashJoin Inner BuildLeft (47) - : :- Exchange (43) - : : +- Project (42) - : : +- Filter (41) - : : +- Scan parquet (40) - : +- Exchange (46) - : +- Filter (45) - : +- Scan parquet (44) - +- Exchange (53) - +- Project (52) - +- Filter (51) - +- Scan parquet (50) + TakeOrderedAndProject (66) + +- HashAggregate (65) + +- HashAggregate (64) + +- Project (63) + +- SortMergeJoin Inner (62) + :- Sort (56) + : +- Exchange (55) + : +- Project (54) + : +- SortMergeJoin Inner (53) + : :- Sort (48) + : : +- Exchange (47) + : : +- Project (46) + : : +- Filter (45) + : : +- Scan parquet (44) + : +- Sort (52) + : +- Exchange (51) + : +- Filter (50) + : +- Scan parquet (49) + +- Sort (61) + +- Exchange (60) + +- Project (59) + +- Filter (58) + +- Scan parquet (57) (1) Scan parquet @@ -76,244 +80,276 @@ Input [2]: [c_custkey#X, c_mktsegment#X] Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, c_custkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [c_custkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [c_custkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [c_custkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter 
Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] +Arguments: X + +(23) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] 
Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [6]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(34) RegularHashAggregateExecTransformer +(38) RegularHashAggregateExecTransformer Input [6]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(35) RegularHashAggregateExecTransformer +(39) RegularHashAggregateExecTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(36) ProjectExecTransformer +(40) ProjectExecTransformer Output [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] Input [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(37) WholeStageCodegenTransformer (X) +(41) WholeStageCodegenTransformer (X) Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, 
o_shippriority#X] Arguments: false -(38) TakeOrderedAndProjectExecTransformer +(42) TakeOrderedAndProjectExecTransformer Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X], 0 -(39) VeloxColumnarToRowExec +(43) VeloxColumnarToRowExec Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(40) Scan parquet +(44) Scan parquet Output [2]: [c_custkey#X, c_mktsegment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_mktsegment), EqualTo(c_mktsegment,BUILDING), IsNotNull(c_custkey)] ReadSchema: struct -(41) Filter +(45) Filter Input [2]: [c_custkey#X, c_mktsegment#X] Condition : ((isnotnull(c_mktsegment#X) AND (c_mktsegment#X = BUILDING)) AND isnotnull(c_custkey#X)) -(42) Project +(46) Project Output [1]: [c_custkey#X] Input [2]: [c_custkey#X, c_mktsegment#X] -(43) Exchange +(47) Exchange Input [1]: [c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(44) Scan parquet +(48) Sort +Input [1]: [c_custkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(49) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(45) Filter +(50) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Condition : (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(46) Exchange +(51) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(47) ShuffledHashJoin +(52) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(53) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(48) Project +(54) Project Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(49) Exchange +(55) Exchange Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(50) Scan parquet +(56) Sort +Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(57) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(51) Filter +(58) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(52) Project +(59) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(53) Exchange +(60) Exchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) ShuffledHashJoin +(61) Sort +Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], 
false, 0 + +(62) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(55) Project +(63) Project Output [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(56) HashAggregate +(64) HashAggregate Input [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(57) HashAggregate +(65) HashAggregate Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] -(58) TakeOrderedAndProject +(66) TakeOrderedAndProject Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(59) AdaptiveSparkPlan +(67) AdaptiveSparkPlan Output [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/4.txt index 421f6a412ec79..130bc2983040c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/4.txt @@ -1,51 +1,53 @@ == Physical Plan == -AdaptiveSparkPlan (50) +AdaptiveSparkPlan (56) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- ^ SortExecTransformer (32) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer LeftSemi (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- 
ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (49) - +- Exchange (48) - +- HashAggregate (47) - +- Exchange (46) - +- HashAggregate (45) - +- Project (44) - +- ShuffledHashJoin LeftSemi BuildRight (43) - :- Exchange (38) - : +- Project (37) - : +- Filter (36) - : +- Scan parquet (35) - +- Exchange (42) - +- Project (41) - +- Filter (40) - +- Scan parquet (39) + Sort (55) + +- Exchange (54) + +- HashAggregate (53) + +- Exchange (52) + +- HashAggregate (51) + +- Project (50) + +- SortMergeJoin LeftSemi (49) + :- Sort (43) + : +- Exchange (42) + : +- Project (41) + : +- Filter (40) + : +- Scan parquet (39) + +- Sort (48) + +- Exchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -67,200 +69,224 @@ Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [o_orderkey#X, o_orderpriority#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderpriority#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Arguments: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [2]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, l_orderkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [2]: [hash_partition_key#X, l_orderkey#X] +Arguments: X + +(15) ColumnarExchange Input [2]: [hash_partition_key#X, l_orderkey#X] Arguments: 
hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [1]: [l_orderkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [1]: [l_orderkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [1]: [l_orderkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [3]: [hash(o_orderpriority#X, 42) AS hash_partition_key#X, o_orderpriority#X, count#X] Input [2]: [o_orderpriority#X, count#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] +Arguments: X + +(25) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [2]: [o_orderpriority#X, count#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [2]: [o_orderpriority#X, count#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [2]: [o_orderpriority#X, order_count#X] +Arguments: X + +(32) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [o_orderpriority#X, order_count#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [o_orderpriority#X, order_count#X] -(32) SortExecTransformer +(36) SortExecTransformer Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(33) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(34) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [2]: [o_orderpriority#X, order_count#X] -(35) Scan parquet +(39) Scan parquet Output [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-07-01), LessThan(o_orderdate,1993-10-01)] ReadSchema: struct -(36) 
Filter +(40) Filter Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Condition : ((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-07-01)) AND (o_orderdate#X < 1993-10-01)) -(37) Project +(41) Project Output [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] -(38) Exchange +(42) Exchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) Scan parquet +(43) Sort +Input [2]: [o_orderkey#X, o_orderpriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(44) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(40) Filter +(45) Filter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(41) Project +(46) Project Output [1]: [l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(42) Exchange +(47) Exchange Input [1]: [l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) ShuffledHashJoin +(48) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(49) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(44) Project +(50) Project Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(45) HashAggregate +(51) HashAggregate Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(46) Exchange +(52) Exchange Input [2]: [o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(47) HashAggregate +(53) HashAggregate Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(48) Exchange +(54) Exchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) Sort +(55) Sort Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(50) AdaptiveSparkPlan +(56) AdaptiveSparkPlan Output [2]: [o_orderpriority#X, order_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/5.txt index 50173820058cc..10ce074fd7601 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/5.txt @@ -1,127 +1,137 @@ == Physical Plan == -AdaptiveSparkPlan (134) +AdaptiveSparkPlan (156) +- == Final Plan == - VeloxColumnarToRowExec (94) - +- ^ SortExecTransformer (92) - +- ^ InputIteratorTransformer (91) - +- ^ InputAdapter (90) - +- ^ ShuffleQueryStage (89), Statistics(X) - +- ColumnarExchange (88) - +- ^ RegularHashAggregateExecTransformer (86) - +- ^ InputIteratorTransformer (85) - +- ^ InputAdapter (84) - +- ^ ShuffleQueryStage (83), Statistics(X) - +- ColumnarExchange (82) - +- ^ ProjectExecTransformer (80) - +- ^ 
FlushableHashAggregateExecTransformer (79) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (106) + +- ^ SortExecTransformer (104) + +- ^ InputIteratorTransformer (103) + +- ShuffleQueryStage (101), Statistics(X) + +- ColumnarExchange (100) + +- VeloxAppendBatches (99) + +- ^ RegularHashAggregateExecTransformer (97) + +- ^ InputIteratorTransformer (96) + +- ShuffleQueryStage (94), Statistics(X) + +- ColumnarExchange (93) + +- VeloxAppendBatches (92) + +- ^ ProjectExecTransformer (90) + +- ^ FlushableHashAggregateExecTransformer (89) + +- ^ ProjectExecTransformer (88) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75), Statistics(X) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58), Statistics(X) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41), Statistics(X) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ NoopFilter (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ NoopFilter (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29), 
Statistics(X) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ NoopFilter (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ NoopFilter (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ NoopFilter (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74), Statistics(X) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ NoopFilter (70) - +- ^ Scan parquet (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ NoopFilter (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33), Statistics(X) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ NoopFilter (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ NoopFilter (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ NoopFilter (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (86) + +- ShuffleQueryStage (84), Statistics(X) + +- ColumnarExchange (83) + +- VeloxAppendBatches (82) + +- ^ ProjectExecTransformer (80) + +- ^ NoopFilter (79) + +- ^ Scan parquet (78) +- == Initial Plan == - Sort (133) - +- Exchange (132) - +- HashAggregate (131) - +- Exchange (130) - +- HashAggregate (129) - +- Project (128) - +- ShuffledHashJoin Inner BuildRight (127) - :- Exchange (122) - : +- Project (121) - : +- ShuffledHashJoin Inner BuildRight (120) - : :- Exchange (116) - : : +- Project (115) - : : +- ShuffledHashJoin Inner BuildRight (114) - : : :- Exchange (110) - : : : +- Project (109) - : : : +- ShuffledHashJoin Inner BuildRight (108) - : : : :- Exchange (104) - : : : : +- Project (103) - : : : : +- ShuffledHashJoin Inner BuildLeft (102) - : : : : :- Exchange (97) - : : : : : +- Filter (96) - : : : : : +- Scan parquet (95) - : : : : +- Exchange (101) - : : : : +- Project (100) - : : : : +- Filter (99) - : : : : +- Scan parquet (98) - : : : +- Exchange (107) - : : : +- Filter (106) - : : : +- Scan parquet (105) - : : +- Exchange (113) - : : +- Filter (112) - : : +- Scan parquet (111) - : +- Exchange (119) - : +- Filter (118) - : +- Scan parquet (117) - +- Exchange (126) - +- Project (125) - +- Filter (124) - +- Scan parquet (123) + Sort (155) + +- Exchange (154) + +- HashAggregate (153) + +- Exchange (152) + +- HashAggregate (151) + +- Project (150) + +- SortMergeJoin Inner (149) + :- Sort (143) + : +- Exchange (142) + : +- Project (141) + : +- SortMergeJoin Inner (140) + : :- Sort (135) + : : +- Exchange (134) + : : +- Project (133) + : : +- SortMergeJoin Inner (132) + : : :- Sort (127) + : : : +- Exchange (126) + : : : +- 
Project (125) + : : : +- SortMergeJoin Inner (124) + : : : :- Sort (119) + : : : : +- Exchange (118) + : : : : +- Project (117) + : : : : +- SortMergeJoin Inner (116) + : : : : :- Sort (110) + : : : : : +- Exchange (109) + : : : : : +- Filter (108) + : : : : : +- Scan parquet (107) + : : : : +- Sort (115) + : : : : +- Exchange (114) + : : : : +- Project (113) + : : : : +- Filter (112) + : : : : +- Scan parquet (111) + : : : +- Sort (123) + : : : +- Exchange (122) + : : : +- Filter (121) + : : : +- Scan parquet (120) + : : +- Sort (131) + : : +- Exchange (130) + : : +- Filter (129) + : : +- Scan parquet (128) + : +- Sort (139) + : +- Exchange (138) + : +- Filter (137) + : +- Scan parquet (136) + +- Sort (148) + +- Exchange (147) + +- Project (146) + +- Filter (145) + +- Scan parquet (144) (1) Scan parquet @@ -143,552 +153,640 @@ Input [2]: [c_custkey#X, c_nationkey#X] Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] Arguments: false -(20) ColumnarExchange +(22) 
VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] +Arguments: X + +(23) ColumnarExchange Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [2]: [c_nationkey#X, o_orderkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [2]: [c_nationkey#X, o_orderkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [2]: [c_nationkey#X, o_orderkey#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [5]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [5]: [hash(l_suppkey#X, c_nationkey#X, 42) AS hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(40) ColumnarExchange Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [4]: [c_nationkey#X, l_suppkey#X, 
l_extendedprice#X, l_discount#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: [s_suppkey#X, s_nationkey#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, s_nationkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(57) ColumnarExchange Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] -(54) Scan parquet +(61) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(55) NoopFilter +(62) NoopFilter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: [n_nationkey#X, n_name#X, n_regionkey#X] -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [4]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, 
n_regionkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] +Arguments: X + +(66) ColumnarExchange Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [5]: [hash(n_regionkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] +Arguments: X + +(74) ColumnarExchange Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(70) NoopFilter +(79) NoopFilter Input [2]: [r_regionkey#X, r_name#X] Arguments: [r_regionkey#X, r_name#X] -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [2]: [hash(r_regionkey#X, 42) AS hash_partition_key#X, r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [2]: [hash_partition_key#X, r_regionkey#X] +Arguments: X + +(83) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [1]: [r_regionkey#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [1]: [r_regionkey#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [1]: [r_regionkey#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [4]: 
[l_extendedprice#X, l_discount#X, n_name#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(79) FlushableHashAggregateExecTransformer +(89) FlushableHashAggregateExecTransformer Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, _pre_X#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(80) ProjectExecTransformer +(90) ProjectExecTransformer Output [4]: [hash(n_name#X, 42) AS hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Input [3]: [n_name#X, sum#X, isEmpty#X] -(81) WholeStageCodegenTransformer (X) +(91) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: false -(82) ColumnarExchange +(92) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] +Arguments: X + +(93) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(83) ShuffleQueryStage +(94) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] Arguments: X -(84) InputAdapter +(95) InputAdapter Input [3]: [n_name#X, sum#X, isEmpty#X] -(85) InputIteratorTransformer +(96) InputIteratorTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] -(86) RegularHashAggregateExecTransformer +(97) RegularHashAggregateExecTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [2]: [n_name#X, revenue#X] +Arguments: X + +(100) ColumnarExchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [2]: [n_name#X, revenue#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [2]: [n_name#X, revenue#X] -(92) SortExecTransformer +(104) SortExecTransformer Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(93) WholeStageCodegenTransformer (X) +(105) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(94) VeloxColumnarToRowExec +(106) VeloxColumnarToRowExec Input [2]: [n_name#X, revenue#X] -(95) Scan parquet +(107) Scan parquet Output [2]: 
[c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(96) Filter +(108) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(97) Exchange +(109) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(98) Scan parquet +(110) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(111) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(99) Filter +(112) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(100) Project +(113) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(101) Exchange +(114) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) ShuffledHashJoin +(115) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(116) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(103) Project +(117) Project Output [2]: [c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(104) Exchange +(118) Exchange Input [2]: [c_nationkey#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) Scan parquet +(119) Sort +Input [2]: [c_nationkey#X, o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(120) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(106) Filter +(121) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(107) Exchange +(122) Exchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) ShuffledHashJoin +(123) Sort +Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(124) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(109) Project +(125) Project Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(110) Exchange +(126) Exchange Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(111) Scan parquet +(127) Sort +Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST, c_nationkey#X ASC NULLS FIRST], false, 0 + +(128) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: 
InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(112) Filter +(129) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(113) Exchange +(130) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(114) ShuffledHashJoin +(131) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST, s_nationkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join condition: None -(115) Project +(133) Project Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(116) Exchange +(134) Exchange Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) Scan parquet +(135) Sort +Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(136) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(118) Filter +(137) Filter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(119) Exchange +(138) Exchange Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(120) ShuffledHashJoin +(139) Sort +Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(140) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(121) Project +(141) Project Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(122) Exchange +(142) Exchange Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(123) Scan parquet +(143) Sort +Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] +Arguments: [n_regionkey#X ASC NULLS FIRST], false, 0 + +(144) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(124) Filter +(145) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(125) Project +(146) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(126) Exchange +(147) Exchange Input [1]: [r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(127) ShuffledHashJoin +(148) Sort +Input [1]: [r_regionkey#X] +Arguments: [r_regionkey#X ASC NULLS FIRST], false, 0 + +(149) SortMergeJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(128) Project +(150) Project Output [3]: [l_extendedprice#X, l_discount#X, n_name#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(129) HashAggregate +(151) HashAggregate Input [3]: 
[l_extendedprice#X, l_discount#X, n_name#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(130) Exchange +(152) Exchange Input [3]: [n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(131) HashAggregate +(153) HashAggregate Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(132) Exchange +(154) Exchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(133) Sort +(155) Sort Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(134) AdaptiveSparkPlan +(156) AdaptiveSparkPlan Output [2]: [n_name#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/6.txt index 51c5836bdd11e..fa9c936a0ca11 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/6.txt @@ -1,23 +1,23 @@ == Physical Plan == -AdaptiveSparkPlan (19) +AdaptiveSparkPlan (20) +- == Final Plan == - VeloxColumnarToRowExec (12) - +- ^ RegularHashAggregateExecTransformer (10) - +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ ShuffleQueryStage (7), Statistics(X) - +- ColumnarExchange (6) + VeloxColumnarToRowExec (13) + +- ^ RegularHashAggregateExecTransformer (11) + +- ^ InputIteratorTransformer (10) + +- ShuffleQueryStage (8), Statistics(X) + +- ColumnarExchange (7) + +- VeloxAppendBatches (6) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ NoopFilter (2) +- ^ Scan parquet (1) +- == Initial Plan == - HashAggregate (18) - +- Exchange (17) - +- HashAggregate (16) - +- Project (15) - +- Filter (14) - +- Scan parquet (13) + HashAggregate (19) + +- Exchange (18) + +- HashAggregate (17) + +- Project (16) + +- Filter (15) + +- Scan parquet (14) (1) Scan parquet @@ -46,67 +46,71 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(6) ColumnarExchange +(6) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(7) ShuffleQueryStage +(8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(8) InputAdapter +(9) InputAdapter Input [2]: [sum#X, 
isEmpty#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(10) RegularHashAggregateExecTransformer +(11) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X AS revenue#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(12) VeloxColumnarToRowExec +(13) VeloxColumnarToRowExec Input [1]: [revenue#X] -(13) Scan parquet +(14) Scan parquet Output [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), IsNotNull(l_discount), IsNotNull(l_quantity), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), GreaterThanOrEqual(l_discount,0.05), LessThanOrEqual(l_discount,0.07), LessThan(l_quantity,24.00)] ReadSchema: struct -(14) Filter +(15) Filter Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((((((isnotnull(l_shipdate#X) AND isnotnull(l_discount#X)) AND isnotnull(l_quantity#X)) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND (l_discount#X >= 0.05)) AND (l_discount#X <= 0.07)) AND (l_quantity#X < 24.00)) -(15) Project +(16) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) HashAggregate +(17) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(17) Exchange +(18) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(18) HashAggregate +(19) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X AS revenue#X] -(19) AdaptiveSparkPlan +(20) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/7.txt index 67f4274b3aa35..651cfa840be88 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/7.txt @@ -1,122 +1,131 @@ == Physical Plan == -AdaptiveSparkPlan (128) +AdaptiveSparkPlan (149) +- == Final Plan == - VeloxColumnarToRowExec (90) - +- ^ SortExecTransformer (88) - +- ^ InputIteratorTransformer (87) - +- ^ InputAdapter (86) - +- ^ ShuffleQueryStage (85), Statistics(X) - +- ColumnarExchange (84) - +- ^ RegularHashAggregateExecTransformer (82) - +- ^ 
InputIteratorTransformer (81) - +- ^ InputAdapter (80) - +- ^ ShuffleQueryStage (79), Statistics(X) - +- ColumnarExchange (78) - +- ^ ProjectExecTransformer (76) - +- ^ FlushableHashAggregateExecTransformer (75) - +- ^ ProjectExecTransformer (74) - +- ^ ShuffledHashJoinExecTransformer Inner (73) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (101) + +- ^ SortExecTransformer (99) + +- ^ InputIteratorTransformer (98) + +- ShuffleQueryStage (96), Statistics(X) + +- ColumnarExchange (95) + +- VeloxAppendBatches (94) + +- ^ RegularHashAggregateExecTransformer (92) + +- ^ InputIteratorTransformer (91) + +- ShuffleQueryStage (89), Statistics(X) + +- ColumnarExchange (88) + +- VeloxAppendBatches (87) + +- ^ ProjectExecTransformer (85) + +- ^ FlushableHashAggregateExecTransformer (84) + +- ^ ProjectExecTransformer (83) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (82) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75), Statistics(X) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58), Statistics(X) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41), Statistics(X) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ NoopFilter (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ 
NoopFilter (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ NoopFilter (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ NoopFilter (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ NoopFilter (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (72) - +- ^ InputAdapter (71) - +- ^ ShuffleQueryStage (70), Statistics(X) - +- ReusedExchange (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ NoopFilter (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33), Statistics(X) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ NoopFilter (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ NoopFilter (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ NoopFilter (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (81) + +- ShuffleQueryStage (79), Statistics(X) + +- ReusedExchange (78) +- == Initial Plan == - Sort (127) - +- Exchange (126) - +- HashAggregate (125) - +- Exchange (124) - +- HashAggregate (123) - +- Project (122) - +- ShuffledHashJoin Inner BuildRight (121) - :- Exchange (117) - : +- Project (116) - : +- ShuffledHashJoin Inner BuildRight (115) - : :- Exchange (111) - : : +- Project (110) - : : +- ShuffledHashJoin Inner BuildRight (109) - : : :- Exchange (105) - : : : +- Project (104) - : : : +- ShuffledHashJoin Inner BuildRight (103) - : : : :- Exchange (99) - : : : : +- Project (98) - : : : : +- ShuffledHashJoin Inner BuildLeft (97) - : : : : :- Exchange (93) - : : : : : +- Filter (92) - : : : : : +- Scan parquet (91) - : : : : +- Exchange (96) - : : : : +- Filter (95) - : : : : +- Scan parquet (94) - : : : +- Exchange (102) - : : : +- Filter (101) - : : : +- Scan parquet (100) - : : +- Exchange (108) - : : +- Filter (107) - : : +- Scan parquet (106) - : +- Exchange (114) - : +- Filter (113) - : +- Scan parquet (112) - +- Exchange (120) - +- Filter (119) - +- Scan parquet (118) + Sort (148) + +- Exchange (147) + +- HashAggregate (146) + +- Exchange (145) + +- HashAggregate (144) + +- Project (143) + +- SortMergeJoin Inner (142) + :- Sort (137) + : +- Exchange (136) + : +- Project (135) + : +- SortMergeJoin Inner (134) + : :- Sort (129) + : : +- Exchange (128) + : : +- Project (127) + : : +- SortMergeJoin Inner (126) + : : :- Sort (121) + : : : +- Exchange (120) + : : : +- Project (119) + : : : +- SortMergeJoin Inner (118) + : : : :- Sort (113) + : : : : +- 
Exchange (112) + : : : : +- Project (111) + : : : : +- SortMergeJoin Inner (110) + : : : : :- Sort (105) + : : : : : +- Exchange (104) + : : : : : +- Filter (103) + : : : : : +- Scan parquet (102) + : : : : +- Sort (109) + : : : : +- Exchange (108) + : : : : +- Filter (107) + : : : : +- Scan parquet (106) + : : : +- Sort (117) + : : : +- Exchange (116) + : : : +- Filter (115) + : : : +- Scan parquet (114) + : : +- Sort (125) + : : +- Exchange (124) + : : +- Filter (123) + : : +- Scan parquet (122) + : +- Sort (133) + : +- Exchange (132) + : +- Filter (131) + : +- Scan parquet (130) + +- Sort (141) + +- Exchange (140) + +- Filter (139) + +- Scan parquet (138) (1) Scan parquet @@ -138,524 +147,608 @@ Input [2]: [s_suppkey#X, s_nationkey#X] Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [6]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: X + +(15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(18) ProjectExecTransformer +(20) 
ProjectExecTransformer Output [6]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: X + +(23) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [2]: [o_orderkey#X, o_custkey#X] Arguments: [o_orderkey#X, o_custkey#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [2]: [o_orderkey#X, o_custkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [6]: [hash(o_custkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] +Arguments: X + +(40) ColumnarExchange Input [6]: [hash_partition_key#X, 
s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [2]: [c_custkey#X, c_nationkey#X] Arguments: [c_custkey#X, c_nationkey#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(c_custkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_nationkey#X] Input [2]: [c_custkey#X, c_nationkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [6]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] +Arguments: X + +(57) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -(54) Scan parquet +(61) Scan parquet 
Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(55) NoopFilter +(62) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [6]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] +Arguments: X + +(74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -(69) ReusedExchange [Reuses operator id: 58] +(78) ReusedExchange [Reuses operator id: 66] Output [2]: [n_nationkey#X, n_name#X] -(70) ShuffleQueryStage +(79) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(71) InputAdapter +(80) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(72) InputIteratorTransformer +(81) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(73) ShuffledHashJoinExecTransformer +(82) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(74) ProjectExecTransformer +(83) ProjectExecTransformer Output [4]: [n_name#X AS supp_nation#X, n_name#X AS 
cust_nation#X, year(l_shipdate#X) AS l_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(75) FlushableHashAggregateExecTransformer +(84) FlushableHashAggregateExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(76) ProjectExecTransformer +(85) ProjectExecTransformer Output [6]: [hash(supp_nation#X, cust_nation#X, l_year#X, 42) AS hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(77) WholeStageCodegenTransformer (X) +(86) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: false -(78) ColumnarExchange +(87) VeloxAppendBatches +Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] +Arguments: X + +(88) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(79) ShuffleQueryStage +(89) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: X -(80) InputAdapter +(90) InputAdapter Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(81) InputIteratorTransformer +(91) InputIteratorTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(82) RegularHashAggregateExecTransformer +(92) RegularHashAggregateExecTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(83) WholeStageCodegenTransformer (X) +(93) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(84) ColumnarExchange +(94) VeloxAppendBatches +Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] +Arguments: X + +(95) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(85) ShuffleQueryStage +(96) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: X -(86) InputAdapter +(97) InputAdapter Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(87) InputIteratorTransformer +(98) InputIteratorTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(88) SortExecTransformer +(99) SortExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(89) WholeStageCodegenTransformer (X) +(100) 
WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(90) VeloxColumnarToRowExec +(101) VeloxColumnarToRowExec Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(91) Scan parquet +(102) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(92) Filter +(103) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(93) Exchange +(104) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(94) Scan parquet +(105) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(106) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(95) Filter +(107) Filter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(96) Exchange +(108) Exchange Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) ShuffledHashJoin +(109) Sort +Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(110) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(98) Project +(111) Project Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(99) Exchange +(112) Exchange Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(100) Scan parquet +(113) Sort +Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(114) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(101) Filter +(115) Filter Input [2]: [o_orderkey#X, o_custkey#X] Condition : (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(102) Exchange +(116) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(103) ShuffledHashJoin +(117) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(118) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(104) Project +(119) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(105) Exchange +(120) Exchange Input [5]: [s_nationkey#X, 
l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) Scan parquet +(121) Sort +Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(122) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(107) Filter +(123) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(108) Exchange +(124) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(125) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(126) SortMergeJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(110) Project +(127) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(111) Exchange +(128) Exchange Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) Scan parquet +(129) Sort +Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(130) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(113) Filter +(131) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(114) Exchange +(132) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(115) ShuffledHashJoin +(133) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(134) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(116) Project +(135) Project Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(117) Exchange +(136) Exchange Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(118) Scan parquet +(137) Sort +Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(138) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,GERMANY),EqualTo(n_name,FRANCE))] ReadSchema: struct -(119) Filter +(139) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = GERMANY) OR (n_name#X = FRANCE))) -(120) Exchange +(140) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(121) ShuffledHashJoin +(141) Sort +Input [2]: 
[n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(142) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(122) Project +(143) Project Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(123) HashAggregate +(144) HashAggregate Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(124) Exchange +(145) Exchange Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(125) HashAggregate +(146) HashAggregate Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(126) Exchange +(147) Exchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(127) Sort +(148) Sort Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(128) AdaptiveSparkPlan +(149) AdaptiveSparkPlan Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/8.txt index 16838a0513b3d..e359f4c944e72 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/8.txt @@ -1,166 +1,180 @@ == Physical Plan == -AdaptiveSparkPlan (177) +AdaptiveSparkPlan (207) +- == Final Plan == - VeloxColumnarToRowExec (125) - +- ^ SortExecTransformer (123) - +- ^ InputIteratorTransformer (122) - +- ^ InputAdapter (121) - +- ^ ShuffleQueryStage (120), Statistics(X) - +- ColumnarExchange (119) - +- ^ ProjectExecTransformer (117) - +- ^ RegularHashAggregateExecTransformer (116) - +- ^ InputIteratorTransformer (115) - +- ^ InputAdapter (114) - +- ^ ShuffleQueryStage (113), Statistics(X) - +- ColumnarExchange (112) - +- ^ ProjectExecTransformer (110) - +- ^ FlushableHashAggregateExecTransformer (109) - +- ^ ProjectExecTransformer (108) - +- ^ ShuffledHashJoinExecTransformer Inner (107) - :- ^ InputIteratorTransformer (98) - : +- ^ InputAdapter (97) - : +- ^ ShuffleQueryStage (96), Statistics(X) - : +- ColumnarExchange (95) - : +- ^ ProjectExecTransformer (93) - : +- ^ ShuffledHashJoinExecTransformer Inner (92) - : :- ^ InputIteratorTransformer (83) - : : +- ^ 
InputAdapter (82) - : : +- ^ ShuffleQueryStage (81), Statistics(X) - : : +- ColumnarExchange (80) - : : +- ^ ProjectExecTransformer (78) - : : +- ^ ShuffledHashJoinExecTransformer Inner (77) - : : :- ^ InputIteratorTransformer (68) - : : : +- ^ InputAdapter (67) - : : : +- ^ ShuffleQueryStage (66), Statistics(X) - : : : +- ColumnarExchange (65) - : : : +- ^ ProjectExecTransformer (63) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : : : :- ^ InputIteratorTransformer (53) - : : : : +- ^ InputAdapter (52) - : : : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : : : +- ColumnarExchange (50) - : : : : +- ^ ProjectExecTransformer (48) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : : : :- ^ InputIteratorTransformer (38) - : : : : : +- ^ InputAdapter (37) - : : : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : : : +- ColumnarExchange (35) - : : : : : +- ^ ProjectExecTransformer (33) - : : : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : : : :- ^ InputIteratorTransformer (23) - : : : : : : +- ^ InputAdapter (22) - : : : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : : : +- ColumnarExchange (20) - : : : : : : +- ^ ProjectExecTransformer (18) - : : : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : : : :- ^ InputIteratorTransformer (8) - : : : : : : : +- ^ InputAdapter (7) - : : : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (141) + +- ^ SortExecTransformer (139) + +- ^ InputIteratorTransformer (138) + +- ShuffleQueryStage (136), Statistics(X) + +- ColumnarExchange (135) + +- VeloxAppendBatches (134) + +- ^ ProjectExecTransformer (132) + +- ^ RegularHashAggregateExecTransformer (131) + +- ^ InputIteratorTransformer (130) + +- ShuffleQueryStage (128), Statistics(X) + +- ColumnarExchange (127) + +- VeloxAppendBatches (126) + +- ^ ProjectExecTransformer (124) + +- ^ FlushableHashAggregateExecTransformer (123) + +- ^ ProjectExecTransformer (122) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (121) + :- ^ InputIteratorTransformer (111) + : +- ShuffleQueryStage (109), Statistics(X) + : +- ColumnarExchange (108) + : +- VeloxAppendBatches (107) + : +- ^ ProjectExecTransformer (105) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (104) + : :- ^ InputIteratorTransformer (94) + : : +- ShuffleQueryStage (92), Statistics(X) + : : +- ColumnarExchange (91) + : : +- VeloxAppendBatches (90) + : : +- ^ ProjectExecTransformer (88) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + : : :- ^ InputIteratorTransformer (77) + : : : +- ShuffleQueryStage (75), Statistics(X) + : : : +- ColumnarExchange (74) + : : : +- VeloxAppendBatches (73) + : : : +- ^ ProjectExecTransformer (71) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (70) + : : : :- ^ InputIteratorTransformer (60) + : : : : +- ShuffleQueryStage (58), Statistics(X) + : : : : +- ColumnarExchange (57) + : : : : +- VeloxAppendBatches (56) + : : : : +- ^ ProjectExecTransformer (54) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (53) + : : : : :- ^ InputIteratorTransformer (43) + : : : : : +- ShuffleQueryStage (41), Statistics(X) + : : : : : +- ColumnarExchange (40) + : : : : : +- VeloxAppendBatches (39) + : : : : : +- ^ ProjectExecTransformer (37) + : : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : : : :- ^ InputIteratorTransformer (26) + : : : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : : : +- ColumnarExchange (23) + : : 
: : : : +- VeloxAppendBatches (22) + : : : : : : +- ^ ProjectExecTransformer (20) + : : : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : : : :- ^ InputIteratorTransformer (9) + : : : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : : : +- ColumnarExchange (6) + : : : : : : : +- VeloxAppendBatches (5) : : : : : : : +- ^ ProjectExecTransformer (3) : : : : : : : +- ^ NoopFilter (2) : : : : : : : +- ^ Scan parquet (1) - : : : : : : +- ^ InputIteratorTransformer (16) - : : : : : : +- ^ InputAdapter (15) - : : : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : : : +- ColumnarExchange (13) - : : : : : : +- ^ ProjectExecTransformer (11) - : : : : : : +- ^ NoopFilter (10) - : : : : : : +- ^ Scan parquet (9) - : : : : : +- ^ InputIteratorTransformer (31) - : : : : : +- ^ InputAdapter (30) - : : : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : : : +- ColumnarExchange (28) - : : : : : +- ^ ProjectExecTransformer (26) - : : : : : +- ^ NoopFilter (25) - : : : : : +- ^ Scan parquet (24) - : : : : +- ^ InputIteratorTransformer (46) - : : : : +- ^ InputAdapter (45) - : : : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : : : +- ColumnarExchange (43) - : : : : +- ^ ProjectExecTransformer (41) - : : : : +- ^ NoopFilter (40) - : : : : +- ^ Scan parquet (39) - : : : +- ^ InputIteratorTransformer (61) - : : : +- ^ InputAdapter (60) - : : : +- ^ ShuffleQueryStage (59), Statistics(X) - : : : +- ColumnarExchange (58) - : : : +- ^ ProjectExecTransformer (56) - : : : +- ^ NoopFilter (55) - : : : +- ^ Scan parquet (54) - : : +- ^ InputIteratorTransformer (76) - : : +- ^ InputAdapter (75) - : : +- ^ ShuffleQueryStage (74), Statistics(X) - : : +- ColumnarExchange (73) - : : +- ^ ProjectExecTransformer (71) - : : +- ^ NoopFilter (70) - : : +- ^ Scan parquet (69) - : +- ^ InputIteratorTransformer (91) - : +- ^ InputAdapter (90) - : +- ^ ShuffleQueryStage (89), Statistics(X) - : +- ColumnarExchange (88) - : +- ^ ProjectExecTransformer (86) - : +- ^ NoopFilter (85) - : +- ^ Scan parquet (84) - +- ^ InputIteratorTransformer (106) - +- ^ InputAdapter (105) - +- ^ ShuffleQueryStage (104), Statistics(X) - +- ColumnarExchange (103) - +- ^ ProjectExecTransformer (101) - +- ^ NoopFilter (100) - +- ^ Scan parquet (99) + : : : : : : +- ^ InputIteratorTransformer (18) + : : : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : : : +- ColumnarExchange (15) + : : : : : : +- VeloxAppendBatches (14) + : : : : : : +- ^ ProjectExecTransformer (12) + : : : : : : +- ^ NoopFilter (11) + : : : : : : +- ^ Scan parquet (10) + : : : : : +- ^ InputIteratorTransformer (35) + : : : : : +- ShuffleQueryStage (33), Statistics(X) + : : : : : +- ColumnarExchange (32) + : : : : : +- VeloxAppendBatches (31) + : : : : : +- ^ ProjectExecTransformer (29) + : : : : : +- ^ NoopFilter (28) + : : : : : +- ^ Scan parquet (27) + : : : : +- ^ InputIteratorTransformer (52) + : : : : +- ShuffleQueryStage (50), Statistics(X) + : : : : +- ColumnarExchange (49) + : : : : +- VeloxAppendBatches (48) + : : : : +- ^ ProjectExecTransformer (46) + : : : : +- ^ NoopFilter (45) + : : : : +- ^ Scan parquet (44) + : : : +- ^ InputIteratorTransformer (69) + : : : +- ShuffleQueryStage (67), Statistics(X) + : : : +- ColumnarExchange (66) + : : : +- VeloxAppendBatches (65) + : : : +- ^ ProjectExecTransformer (63) + : : : +- ^ NoopFilter (62) + : : : +- ^ Scan parquet (61) + : : +- ^ InputIteratorTransformer (86) + : : +- ShuffleQueryStage (84), Statistics(X) + : : +- ColumnarExchange (83) + : : +- VeloxAppendBatches (82) 
+ : : +- ^ ProjectExecTransformer (80) + : : +- ^ NoopFilter (79) + : : +- ^ Scan parquet (78) + : +- ^ InputIteratorTransformer (103) + : +- ShuffleQueryStage (101), Statistics(X) + : +- ColumnarExchange (100) + : +- VeloxAppendBatches (99) + : +- ^ ProjectExecTransformer (97) + : +- ^ NoopFilter (96) + : +- ^ Scan parquet (95) + +- ^ InputIteratorTransformer (120) + +- ShuffleQueryStage (118), Statistics(X) + +- ColumnarExchange (117) + +- VeloxAppendBatches (116) + +- ^ ProjectExecTransformer (114) + +- ^ NoopFilter (113) + +- ^ Scan parquet (112) +- == Initial Plan == - Sort (176) - +- Exchange (175) - +- HashAggregate (174) - +- Exchange (173) - +- HashAggregate (172) - +- Project (171) - +- ShuffledHashJoin Inner BuildRight (170) - :- Exchange (165) - : +- Project (164) - : +- ShuffledHashJoin Inner BuildRight (163) - : :- Exchange (159) - : : +- Project (158) - : : +- ShuffledHashJoin Inner BuildRight (157) - : : :- Exchange (153) - : : : +- Project (152) - : : : +- ShuffledHashJoin Inner BuildRight (151) - : : : :- Exchange (147) - : : : : +- Project (146) - : : : : +- ShuffledHashJoin Inner BuildRight (145) - : : : : :- Exchange (141) - : : : : : +- Project (140) - : : : : : +- ShuffledHashJoin Inner BuildRight (139) - : : : : : :- Exchange (135) - : : : : : : +- Project (134) - : : : : : : +- ShuffledHashJoin Inner BuildLeft (133) - : : : : : : :- Exchange (129) - : : : : : : : +- Project (128) - : : : : : : : +- Filter (127) - : : : : : : : +- Scan parquet (126) - : : : : : : +- Exchange (132) - : : : : : : +- Filter (131) - : : : : : : +- Scan parquet (130) - : : : : : +- Exchange (138) - : : : : : +- Filter (137) - : : : : : +- Scan parquet (136) - : : : : +- Exchange (144) - : : : : +- Filter (143) - : : : : +- Scan parquet (142) - : : : +- Exchange (150) - : : : +- Filter (149) - : : : +- Scan parquet (148) - : : +- Exchange (156) - : : +- Filter (155) - : : +- Scan parquet (154) - : +- Exchange (162) - : +- Filter (161) - : +- Scan parquet (160) - +- Exchange (169) - +- Project (168) - +- Filter (167) - +- Scan parquet (166) + Sort (206) + +- Exchange (205) + +- HashAggregate (204) + +- Exchange (203) + +- HashAggregate (202) + +- Project (201) + +- SortMergeJoin Inner (200) + :- Sort (194) + : +- Exchange (193) + : +- Project (192) + : +- SortMergeJoin Inner (191) + : :- Sort (186) + : : +- Exchange (185) + : : +- Project (184) + : : +- SortMergeJoin Inner (183) + : : :- Sort (178) + : : : +- Exchange (177) + : : : +- Project (176) + : : : +- SortMergeJoin Inner (175) + : : : :- Sort (170) + : : : : +- Exchange (169) + : : : : +- Project (168) + : : : : +- SortMergeJoin Inner (167) + : : : : :- Sort (162) + : : : : : +- Exchange (161) + : : : : : +- Project (160) + : : : : : +- SortMergeJoin Inner (159) + : : : : : :- Sort (154) + : : : : : : +- Exchange (153) + : : : : : : +- Project (152) + : : : : : : +- SortMergeJoin Inner (151) + : : : : : : :- Sort (146) + : : : : : : : +- Exchange (145) + : : : : : : : +- Project (144) + : : : : : : : +- Filter (143) + : : : : : : : +- Scan parquet (142) + : : : : : : +- Sort (150) + : : : : : : +- Exchange (149) + : : : : : : +- Filter (148) + : : : : : : +- Scan parquet (147) + : : : : : +- Sort (158) + : : : : : +- Exchange (157) + : : : : : +- Filter (156) + : : : : : +- Scan parquet (155) + : : : : +- Sort (166) + : : : : +- Exchange (165) + : : : : +- Filter (164) + : : : : +- Scan parquet (163) + : : : +- Sort (174) + : : : +- Exchange (173) + : : : +- Filter (172) + : : : +- Scan parquet (171) + : : +- Sort (182) + : : +- 
Exchange (181) + : : +- Filter (180) + : : +- Scan parquet (179) + : +- Sort (190) + : +- Exchange (189) + : +- Filter (188) + : +- Scan parquet (187) + +- Sort (199) + +- Exchange (198) + +- Project (197) + +- Filter (196) + +- Scan parquet (195) (1) Scan parquet @@ -182,732 +196,852 @@ Input [2]: [p_partkey#X, p_type#X] Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [p_partkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [p_partkey#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [6]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(23) ColumnarExchange Input [5]: 
[hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: [s_suppkey#X, s_nationkey#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [5]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(40) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(39) Scan parquet +(44) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), 
LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [6]: [hash(o_custkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] +Arguments: X + +(57) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(55) NoopFilter +(62) NoopFilter Input [2]: [c_custkey#X, c_nationkey#X] Arguments: [c_custkey#X, c_nationkey#X] -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(c_custkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_nationkey#X] Input [2]: [c_custkey#X, c_nationkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] 
Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [6]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] +Arguments: X + +(74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(70) NoopFilter +(79) NoopFilter Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: [n_nationkey#X, n_regionkey#X] -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_regionkey#X] Input [2]: [n_nationkey#X, n_regionkey#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] +Arguments: X + +(83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_regionkey#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [2]: [n_nationkey#X, n_regionkey#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [2]: [n_nationkey#X, n_regionkey#X] -(77) 
ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [6]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(79) WholeStageCodegenTransformer (X) +(89) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: false -(80) ColumnarExchange +(90) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] +Arguments: X + +(91) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [id=#X] -(81) ShuffleQueryStage +(92) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: X -(82) InputAdapter +(93) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -(83) InputIteratorTransformer +(94) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -(84) Scan parquet +(95) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(85) NoopFilter +(96) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(86) ProjectExecTransformer +(97) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(100) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(92) ShuffledHashJoinExecTransformer +(104) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(93) ProjectExecTransformer +(105) ProjectExecTransformer Output [6]: [hash(n_regionkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(94) WholeStageCodegenTransformer (X) +(106) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: false -(95) ColumnarExchange +(107) VeloxAppendBatches +Input [6]: 
[hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] +Arguments: X + +(108) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [id=#X] -(96) ShuffleQueryStage +(109) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: X -(97) InputAdapter +(110) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -(98) InputIteratorTransformer +(111) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -(99) Scan parquet +(112) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(100) NoopFilter +(113) NoopFilter Input [2]: [r_regionkey#X, r_name#X] Arguments: [r_regionkey#X, r_name#X] -(101) ProjectExecTransformer +(114) ProjectExecTransformer Output [2]: [hash(r_regionkey#X, 42) AS hash_partition_key#X, r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(102) WholeStageCodegenTransformer (X) +(115) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: false -(103) ColumnarExchange +(116) VeloxAppendBatches +Input [2]: [hash_partition_key#X, r_regionkey#X] +Arguments: X + +(117) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] -(104) ShuffleQueryStage +(118) ShuffleQueryStage Output [1]: [r_regionkey#X] Arguments: X -(105) InputAdapter +(119) InputAdapter Input [1]: [r_regionkey#X] -(106) InputIteratorTransformer +(120) InputIteratorTransformer Input [1]: [r_regionkey#X] -(107) ShuffledHashJoinExecTransformer +(121) ShuffledHashJoinExecTransformer Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(108) ProjectExecTransformer +(122) ProjectExecTransformer Output [4]: [year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS volume#X, n_name#X AS nation#X, CASE WHEN (n_name#X = BRAZIL) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END AS _pre_X#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(109) FlushableHashAggregateExecTransformer +(123) FlushableHashAggregateExecTransformer Input [4]: [o_year#X, volume#X, nation#X, _pre_X#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(_pre_X#X), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(110) ProjectExecTransformer +(124) ProjectExecTransformer Output [6]: [hash(o_year#X, 42) AS hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(111) WholeStageCodegenTransformer (X) +(125) 
WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(112) ColumnarExchange +(126) VeloxAppendBatches +Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(127) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(113) ShuffleQueryStage +(128) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(114) InputAdapter +(129) InputAdapter Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(115) InputIteratorTransformer +(130) InputIteratorTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(116) RegularHashAggregateExecTransformer +(131) RegularHashAggregateExecTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(117) ProjectExecTransformer +(132) ProjectExecTransformer Output [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6)) AS mkt_share#X] Input [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(118) WholeStageCodegenTransformer (X) +(133) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(119) ColumnarExchange +(134) VeloxAppendBatches +Input [2]: [o_year#X, mkt_share#X] +Arguments: X + +(135) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(120) ShuffleQueryStage +(136) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] Arguments: X -(121) InputAdapter +(137) InputAdapter Input [2]: [o_year#X, mkt_share#X] -(122) InputIteratorTransformer +(138) InputIteratorTransformer Input [2]: [o_year#X, mkt_share#X] -(123) SortExecTransformer +(139) SortExecTransformer Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(124) WholeStageCodegenTransformer (X) +(140) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(125) VeloxColumnarToRowExec +(141) VeloxColumnarToRowExec Input [2]: [o_year#X, mkt_share#X] -(126) Scan parquet +(142) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_type), EqualTo(p_type,ECONOMY ANODIZED STEEL), IsNotNull(p_partkey)] ReadSchema: struct -(127) Filter +(143) Filter Input [2]: [p_partkey#X, p_type#X] Condition : ((isnotnull(p_type#X) AND (p_type#X = ECONOMY ANODIZED STEEL)) AND isnotnull(p_partkey#X)) -(128) Project +(144) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_type#X] -(129) Exchange +(145) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(130) Scan parquet +(146) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(147) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, 
l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(131) Filter +(148) Filter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(132) Exchange +(149) Exchange Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(133) ShuffledHashJoin +(150) Sort +Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(151) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(134) Project +(152) Project Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(135) Exchange +(153) Exchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(136) Scan parquet +(154) Sort +Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(155) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(137) Filter +(156) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(138) Exchange +(157) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(139) ShuffledHashJoin +(158) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(159) SortMergeJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(140) Project +(160) Project Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(141) Exchange +(161) Exchange Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(142) Scan parquet +(162) Sort +Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(163) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(143) Filter +(164) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(144) Exchange +(165) Exchange Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(145) ShuffledHashJoin +(166) Sort +Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(167) SortMergeJoin Left 
keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(146) Project +(168) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(147) Exchange +(169) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(148) Scan parquet +(170) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(171) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(149) Filter +(172) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(150) Exchange +(173) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(151) ShuffledHashJoin +(174) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(175) SortMergeJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(152) Project +(176) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(153) Exchange +(177) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(154) Scan parquet +(178) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(179) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(155) Filter +(180) Filter Input [2]: [n_nationkey#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(156) Exchange +(181) Exchange Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(157) ShuffledHashJoin +(182) Sort +Input [2]: [n_nationkey#X, n_regionkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(183) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(158) Project +(184) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(159) Exchange +(185) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(160) Scan parquet +(186) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(187) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(161) Filter +(188) Filter Input [2]: 
[n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(162) Exchange +(189) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(163) ShuffledHashJoin +(190) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(191) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(164) Project +(192) Project Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(165) Exchange +(193) Exchange Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(166) Scan parquet +(194) Sort +Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] +Arguments: [n_regionkey#X ASC NULLS FIRST], false, 0 + +(195) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(167) Filter +(196) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(168) Project +(197) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(169) Exchange +(198) Exchange Input [1]: [r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(170) ShuffledHashJoin +(199) Sort +Input [1]: [r_regionkey#X] +Arguments: [r_regionkey#X ASC NULLS FIRST], false, 0 + +(200) SortMergeJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(171) Project +(201) Project Output [3]: [year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS volume#X, n_name#X AS nation#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(172) HashAggregate +(202) HashAggregate Input [3]: [o_year#X, volume#X, nation#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(173) Exchange +(203) Exchange Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(174) HashAggregate +(204) HashAggregate Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6)) AS mkt_share#X] -(175) Exchange +(205) Exchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(176) Sort +(206) Sort Input [2]: [o_year#X, mkt_share#X] 
Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(177) AdaptiveSparkPlan +(207) AdaptiveSparkPlan Output [2]: [o_year#X, mkt_share#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/9.txt index a24b228c5f774..21c91ca141800 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/9.txt @@ -1,126 +1,136 @@ == Physical Plan == -AdaptiveSparkPlan (133) +AdaptiveSparkPlan (155) +- == Final Plan == - VeloxColumnarToRowExec (94) - +- ^ SortExecTransformer (92) - +- ^ InputIteratorTransformer (91) - +- ^ InputAdapter (90) - +- ^ ShuffleQueryStage (89), Statistics(X) - +- ColumnarExchange (88) - +- ^ RegularHashAggregateExecTransformer (86) - +- ^ InputIteratorTransformer (85) - +- ^ InputAdapter (84) - +- ^ ShuffleQueryStage (83), Statistics(X) - +- ColumnarExchange (82) - +- ^ ProjectExecTransformer (80) - +- ^ FlushableHashAggregateExecTransformer (79) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (106) + +- ^ SortExecTransformer (104) + +- ^ InputIteratorTransformer (103) + +- ShuffleQueryStage (101), Statistics(X) + +- ColumnarExchange (100) + +- VeloxAppendBatches (99) + +- ^ RegularHashAggregateExecTransformer (97) + +- ^ InputIteratorTransformer (96) + +- ShuffleQueryStage (94), Statistics(X) + +- ColumnarExchange (93) + +- VeloxAppendBatches (92) + +- ^ ProjectExecTransformer (90) + +- ^ FlushableHashAggregateExecTransformer (89) + +- ^ ProjectExecTransformer (88) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75), Statistics(X) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58), Statistics(X) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41), Statistics(X) + : : : +- ColumnarExchange (40) + 
: : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ NoopFilter (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ NoopFilter (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ NoopFilter (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ NoopFilter (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ NoopFilter (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74), Statistics(X) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ NoopFilter (70) - +- ^ Scan parquet (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ NoopFilter (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33), Statistics(X) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ NoopFilter (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ NoopFilter (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ NoopFilter (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (86) + +- ShuffleQueryStage (84), Statistics(X) + +- ColumnarExchange (83) + +- VeloxAppendBatches (82) + +- ^ ProjectExecTransformer (80) + +- ^ NoopFilter (79) + +- ^ Scan parquet (78) +- == Initial Plan == - Sort (132) - +- Exchange (131) - +- HashAggregate (130) - +- Exchange (129) - +- HashAggregate (128) - +- Project (127) - +- ShuffledHashJoin Inner BuildRight (126) - :- Exchange (122) - : +- Project (121) - : +- ShuffledHashJoin Inner BuildRight (120) - : :- Exchange (116) - : : +- Project (115) - : : +- ShuffledHashJoin Inner BuildRight (114) - : : :- Exchange (110) - : : : 
+- Project (109) - : : : +- ShuffledHashJoin Inner BuildRight (108) - : : : :- Exchange (104) - : : : : +- Project (103) - : : : : +- ShuffledHashJoin Inner BuildLeft (102) - : : : : :- Exchange (98) - : : : : : +- Project (97) - : : : : : +- Filter (96) - : : : : : +- Scan parquet (95) - : : : : +- Exchange (101) - : : : : +- Filter (100) - : : : : +- Scan parquet (99) - : : : +- Exchange (107) - : : : +- Filter (106) - : : : +- Scan parquet (105) - : : +- Exchange (113) - : : +- Filter (112) - : : +- Scan parquet (111) - : +- Exchange (119) - : +- Filter (118) - : +- Scan parquet (117) - +- Exchange (125) - +- Filter (124) - +- Scan parquet (123) + Sort (154) + +- Exchange (153) + +- HashAggregate (152) + +- Exchange (151) + +- HashAggregate (150) + +- Project (149) + +- SortMergeJoin Inner (148) + :- Sort (143) + : +- Exchange (142) + : +- Project (141) + : +- SortMergeJoin Inner (140) + : :- Sort (135) + : : +- Exchange (134) + : : +- Project (133) + : : +- SortMergeJoin Inner (132) + : : :- Sort (127) + : : : +- Exchange (126) + : : : +- Project (125) + : : : +- SortMergeJoin Inner (124) + : : : :- Sort (119) + : : : : +- Exchange (118) + : : : : +- Project (117) + : : : : +- SortMergeJoin Inner (116) + : : : : :- Sort (111) + : : : : : +- Exchange (110) + : : : : : +- Project (109) + : : : : : +- Filter (108) + : : : : : +- Scan parquet (107) + : : : : +- Sort (115) + : : : : +- Exchange (114) + : : : : +- Filter (113) + : : : : +- Scan parquet (112) + : : : +- Sort (123) + : : : +- Exchange (122) + : : : +- Filter (121) + : : : +- Scan parquet (120) + : : +- Sort (131) + : : +- Exchange (130) + : : +- Filter (129) + : : +- Scan parquet (128) + : +- Sort (139) + : +- Exchange (138) + : +- Filter (137) + : +- Scan parquet (136) + +- Sort (147) + +- Exchange (146) + +- Filter (145) + +- Scan parquet (144) (1) Scan parquet @@ -142,548 +152,636 @@ Input [2]: [p_partkey#X, p_name#X] Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [p_partkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [p_partkey#X] -(9) Scan parquet +(10) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [7]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(13) ColumnarExchange 
+(14) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(15) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [7]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(23) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: [s_suppkey#X, s_nationkey#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, 
s_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [8]: [hash(l_suppkey#X, l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(40) ColumnarExchange Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(39) Scan parquet +(44) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(ps_suppkey#X, ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(46) InputIteratorTransformer 
+(52) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [7]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] +Arguments: X + +(57) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(55) NoopFilter +(62) NoopFilter Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: [o_orderkey#X, o_orderdate#X] -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_orderdate#X] Input [2]: [o_orderkey#X, o_orderdate#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderdate#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [o_orderkey#X, o_orderdate#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderdate#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [7]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, 
l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] +Arguments: X + +(74) ColumnarExchange Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(70) NoopFilter +(79) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) as decimal(27,4))) - promote_precision(cast(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(l_quantity#X)), DecimalType(25,4)) as decimal(27,4)))), DecimalType(27,4)) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(79) FlushableHashAggregateExecTransformer +(89) FlushableHashAggregateExecTransformer Input [3]: [nation#X, o_year#X, amount#X] Keys 
[2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(80) ProjectExecTransformer +(90) ProjectExecTransformer Output [5]: [hash(nation#X, o_year#X, 42) AS hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(81) WholeStageCodegenTransformer (X) +(91) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: false -(82) ColumnarExchange +(92) VeloxAppendBatches +Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] +Arguments: X + +(93) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(83) ShuffleQueryStage +(94) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: X -(84) InputAdapter +(95) InputAdapter Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(85) InputIteratorTransformer +(96) InputIteratorTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(86) RegularHashAggregateExecTransformer +(97) RegularHashAggregateExecTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [3]: [nation#X, o_year#X, sum_profit#X] +Arguments: X + +(100) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [3]: [nation#X, o_year#X, sum_profit#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] -(92) SortExecTransformer +(104) SortExecTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(93) WholeStageCodegenTransformer (X) +(105) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(94) VeloxColumnarToRowExec +(106) VeloxColumnarToRowExec Input [3]: [nation#X, o_year#X, sum_profit#X] -(95) Scan parquet +(107) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringContains(p_name,green), IsNotNull(p_partkey)] ReadSchema: struct -(96) Filter +(108) Filter Input [2]: [p_partkey#X, p_name#X] Condition : ((isnotnull(p_name#X) AND Contains(p_name#X, green)) AND isnotnull(p_partkey#X)) -(97) Project +(109) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(98) Exchange +(110) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Scan parquet +(111) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(112) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true 
Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(100) Filter +(113) Filter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(101) Exchange +(114) Exchange Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) ShuffledHashJoin +(115) Sort +Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(116) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(103) Project +(117) Project Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(104) Exchange +(118) Exchange Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) Scan parquet +(119) Sort +Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(120) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(106) Filter +(121) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(107) Exchange +(122) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) ShuffledHashJoin +(123) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(124) SortMergeJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(109) Project +(125) Project Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(110) Exchange +(126) Exchange Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(111) Scan parquet +(127) Sort +Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [l_suppkey#X ASC NULLS FIRST, l_partkey#X ASC NULLS FIRST], false, 0 + +(128) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(112) Filter +(129) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Condition : (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(113) Exchange +(130) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(114) ShuffledHashJoin +(131) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, 
ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST, ps_partkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join condition: None -(115) Project +(133) Project Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(116) Exchange +(134) Exchange Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) Scan parquet +(135) Sort +Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(136) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(118) Filter +(137) Filter Input [2]: [o_orderkey#X, o_orderdate#X] Condition : isnotnull(o_orderkey#X) -(119) Exchange +(138) Exchange Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(120) ShuffledHashJoin +(139) Sort +Input [2]: [o_orderkey#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(140) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(121) Project +(141) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(122) Exchange +(142) Exchange Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(123) Scan parquet +(143) Sort +Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(144) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(124) Filter +(145) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(125) Exchange +(146) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(126) ShuffledHashJoin +(147) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(148) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(127) Project +(149) Project Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) as decimal(27,4))) - promote_precision(cast(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(l_quantity#X)), DecimalType(25,4)) as decimal(27,4)))), DecimalType(27,4)) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, 
ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(128) HashAggregate +(150) HashAggregate Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(129) Exchange +(151) Exchange Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(130) HashAggregate +(152) HashAggregate Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(131) Exchange +(153) Exchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(132) Sort +(154) Sort Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(133) AdaptiveSparkPlan +(155) AdaptiveSparkPlan Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/1.txt index 41613c36f7bd8..545f2e7e086df 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/1.txt @@ -1,31 +1,31 @@ == Physical Plan == -AdaptiveSparkPlan (28) +AdaptiveSparkPlan (30) +- == Final Plan == - VeloxColumnarToRowExec (19) - +- ^ SortExecTransformer (17) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8), Statistics(X) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (21) + +- ^ SortExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9), Statistics(X) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ ProjectExecTransformer (5) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ NoopFilter (2) +- ^ Scan parquet (1) +- == Initial Plan == - Sort (27) - +- Exchange (26) - +- HashAggregate (25) - +- Exchange (24) - +- HashAggregate (23) - +- Project (22) - +- Filter (21) - +- Scan parquet (20) + Sort (29) + +- Exchange (28) + +- HashAggregate (27) + +- Exchange (26) + +- HashAggregate (25) + +- Project (24) + +- Filter (23) + +- Scan parquet (22) (1) Scan parquet @@ -58,97 +58,105 @@ Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] +Arguments: X + +(8) 
ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum((l_extendedprice#X * (1 - l_discount#X))), sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS sum_disc_price#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] +Arguments: X + +(15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, 
avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(17) SortExecTransformer +(19) SortExecTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(18) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(19) VeloxColumnarToRowExec +(21) VeloxColumnarToRowExec Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(20) Scan parquet +(22) Scan parquet Output [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), LessThanOrEqual(l_shipdate,1998-09-02)] ReadSchema: struct -(21) Filter +(23) Filter Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Condition : (isnotnull(l_shipdate#X) AND (l_shipdate#X <= 1998-09-02)) -(22) Project +(24) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] -(23) HashAggregate +(25) HashAggregate Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [partial_sum(l_quantity#X), partial_sum(l_extendedprice#X), partial_sum((l_extendedprice#X * (1 - l_discount#X))), partial_sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X))), partial_avg(l_quantity#X), partial_avg(l_extendedprice#X), partial_avg(l_discount#X), partial_count(1)] Aggregate Attributes [15]: [sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Results [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(24) Exchange +(26) Exchange Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(25) HashAggregate +(27) HashAggregate Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum((l_extendedprice#X * (1 - l_discount#X))), sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X, sum(((l_extendedprice#X * 
(1 - l_discount#X)) * (1 + l_tax#X)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS sum_disc_price#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(26) Exchange +(28) Exchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Sort +(29) Sort Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(28) AdaptiveSparkPlan +(30) AdaptiveSparkPlan Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/10.txt index 79ff600fa6b73..5e48ceb742d71 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/10.txt @@ -1,85 +1,91 @@ == Physical Plan == -AdaptiveSparkPlan (87) +AdaptiveSparkPlan (100) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- TakeOrderedAndProjectExecTransformer (59) - +- ^ ProjectExecTransformer (57) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) - +- ^ ProjectExecTransformer (50) - +- ^ FlushableHashAggregateExecTransformer (49) - +- ^ ProjectExecTransformer (48) - +- ^ ShuffledHashJoinExecTransformer Inner (47) - :- ^ InputIteratorTransformer (38) - : +- ^ InputAdapter (37) - : +- ^ ShuffleQueryStage (36), Statistics(X) - : +- ColumnarExchange (35) - : +- ^ ProjectExecTransformer (33) - : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : :- ^ InputIteratorTransformer (23) - : : +- ^ InputAdapter (22) - : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : +- ColumnarExchange (20) - : : +- ^ ProjectExecTransformer (18) - : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : :- ^ InputIteratorTransformer (8) - : : : +- ^ InputAdapter (7) - : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (67) + +- TakeOrderedAndProjectExecTransformer (66) + +- ^ ProjectExecTransformer (64) + +- ^ RegularHashAggregateExecTransformer (63) + +- ^ InputIteratorTransformer (62) + +- ShuffleQueryStage (60), Statistics(X) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ ProjectExecTransformer (56) + +- ^ FlushableHashAggregateExecTransformer (55) + +- ^ ProjectExecTransformer (54) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + :- ^ InputIteratorTransformer (43) + : +- ShuffleQueryStage (41), Statistics(X) + : +- ColumnarExchange (40) + : +- 
VeloxAppendBatches (39) + : +- ^ ProjectExecTransformer (37) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + : :- ^ InputIteratorTransformer (26) + : : +- ShuffleQueryStage (24), Statistics(X) + : : +- ColumnarExchange (23) + : : +- VeloxAppendBatches (22) + : : +- ^ ProjectExecTransformer (20) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : : :- ^ InputIteratorTransformer (9) + : : : +- ShuffleQueryStage (7), Statistics(X) + : : : +- ColumnarExchange (6) + : : : +- VeloxAppendBatches (5) : : : +- ^ ProjectExecTransformer (3) : : : +- ^ NoopFilter (2) : : : +- ^ Scan parquet (1) - : : +- ^ InputIteratorTransformer (16) - : : +- ^ InputAdapter (15) - : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : +- ColumnarExchange (13) - : : +- ^ ProjectExecTransformer (11) - : : +- ^ NoopFilter (10) - : : +- ^ Scan parquet (9) - : +- ^ InputIteratorTransformer (31) - : +- ^ InputAdapter (30) - : +- ^ ShuffleQueryStage (29), Statistics(X) - : +- ColumnarExchange (28) - : +- ^ ProjectExecTransformer (26) - : +- ^ NoopFilter (25) - : +- ^ Scan parquet (24) - +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ ShuffleQueryStage (44), Statistics(X) - +- ColumnarExchange (43) - +- ^ ProjectExecTransformer (41) - +- ^ NoopFilter (40) - +- ^ Scan parquet (39) + : : +- ^ InputIteratorTransformer (18) + : : +- ShuffleQueryStage (16), Statistics(X) + : : +- ColumnarExchange (15) + : : +- VeloxAppendBatches (14) + : : +- ^ ProjectExecTransformer (12) + : : +- ^ NoopFilter (11) + : : +- ^ Scan parquet (10) + : +- ^ InputIteratorTransformer (35) + : +- ShuffleQueryStage (33), Statistics(X) + : +- ColumnarExchange (32) + : +- VeloxAppendBatches (31) + : +- ^ ProjectExecTransformer (29) + : +- ^ NoopFilter (28) + : +- ^ Scan parquet (27) + +- ^ InputIteratorTransformer (52) + +- ShuffleQueryStage (50), Statistics(X) + +- ColumnarExchange (49) + +- VeloxAppendBatches (48) + +- ^ ProjectExecTransformer (46) + +- ^ NoopFilter (45) + +- ^ Scan parquet (44) +- == Initial Plan == - TakeOrderedAndProject (86) - +- HashAggregate (85) - +- Exchange (84) - +- HashAggregate (83) - +- Project (82) - +- ShuffledHashJoin Inner BuildRight (81) - :- Exchange (77) - : +- Project (76) - : +- ShuffledHashJoin Inner BuildRight (75) - : :- Exchange (70) - : : +- Project (69) - : : +- ShuffledHashJoin Inner BuildRight (68) - : : :- Exchange (63) - : : : +- Filter (62) - : : : +- Scan parquet (61) - : : +- Exchange (67) - : : +- Project (66) - : : +- Filter (65) - : : +- Scan parquet (64) - : +- Exchange (74) - : +- Project (73) - : +- Filter (72) - : +- Scan parquet (71) - +- Exchange (80) - +- Filter (79) - +- Scan parquet (78) + TakeOrderedAndProject (99) + +- HashAggregate (98) + +- Exchange (97) + +- HashAggregate (96) + +- Project (95) + +- SortMergeJoin Inner (94) + :- Sort (89) + : +- Exchange (88) + : +- Project (87) + : +- SortMergeJoin Inner (86) + : :- Sort (80) + : : +- Exchange (79) + : : +- Project (78) + : : +- SortMergeJoin Inner (77) + : : :- Sort (71) + : : : +- Exchange (70) + : : : +- Filter (69) + : : : +- Scan parquet (68) + : : +- Sort (76) + : : +- Exchange (75) + : : +- Project (74) + : : +- Filter (73) + : : +- Scan parquet (72) + : +- Sort (85) + : +- Exchange (84) + : +- Project (83) + : +- Filter (82) + : +- Scan parquet (81) + +- Sort (93) + +- Exchange (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet @@ -101,364 +107,416 @@ Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acct Input [8]: [hash_partition_key#X, 
c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] +Arguments: X + +(6) ColumnarExchange Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [9]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] +Arguments: X + +(23) ColumnarExchange Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, 
c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Arguments: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [10]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(40) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, 
l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [10]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(49) FlushableHashAggregateExecTransformer +(55) FlushableHashAggregateExecTransformer Input [10]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X, _pre_X#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(50) ProjectExecTransformer +(56) ProjectExecTransformer Output [10]: [hash(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(51) 
WholeStageCodegenTransformer (X) +(57) WholeStageCodegenTransformer (X) Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: false -(52) ColumnarExchange +(58) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] +Arguments: X + +(59) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(60) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: X -(54) InputAdapter +(61) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(55) InputIteratorTransformer +(62) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(56) RegularHashAggregateExecTransformer +(63) RegularHashAggregateExecTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(57) ProjectExecTransformer +(64) ProjectExecTransformer Output [8]: [c_custkey#X, c_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Input [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(58) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: false -(59) TakeOrderedAndProjectExecTransformer +(66) TakeOrderedAndProjectExecTransformer Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X], 0 -(60) VeloxColumnarToRowExec +(67) VeloxColumnarToRowExec Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(61) Scan parquet +(68) Scan parquet Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(62) Filter +(69) Filter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(63) Exchange +(70) Exchange Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, 
c_comment#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) Scan parquet +(71) Sort +Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(72) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(65) Filter +(73) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(66) Project +(74) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(67) Exchange +(75) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) ShuffledHashJoin +(76) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(77) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(69) Project +(78) Project Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(70) Exchange +(79) Exchange Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(80) Sort +Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(81) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(72) Filter +(82) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Condition : ((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(73) Project +(83) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(74) Exchange +(84) Exchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(75) ShuffledHashJoin +(85) Sort +Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(86) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(76) Project +(87) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(77) Exchange +(88) Exchange Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, 
l_discount#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(78) Scan parquet +(89) Sort +Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(79) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(80) Exchange +(92) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(81) ShuffledHashJoin +(93) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(94) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(82) Project +(95) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(83) HashAggregate +(96) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(84) Exchange +(97) Exchange Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) HashAggregate +(98) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [8]: [c_custkey#X, c_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(86) TakeOrderedAndProject +(99) TakeOrderedAndProject Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(87) AdaptiveSparkPlan +(100) AdaptiveSparkPlan Output [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/11.txt index 1b5a8743db5a9..6c9eef6b1f1fc 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/11.txt @@ -1,71 +1,75 @@ == Physical Plan == -AdaptiveSparkPlan 
(72) +AdaptiveSparkPlan (82) +- == Final Plan == - VeloxColumnarToRowExec (50) - +- ^ SortExecTransformer (48) - +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ ShuffleQueryStage (45), Statistics(X) - +- ColumnarExchange (44) - +- ^ FilterExecTransformer (42) - +- ^ RegularHashAggregateExecTransformer (41) - +- ^ InputIteratorTransformer (40) - +- ^ InputAdapter (39) - +- ^ ShuffleQueryStage (38), Statistics(X) - +- ColumnarExchange (37) - +- ^ ProjectExecTransformer (35) - +- ^ FlushableHashAggregateExecTransformer (34) - +- ^ ProjectExecTransformer (33) - +- ^ ShuffledHashJoinExecTransformer Inner (32) - :- ^ InputIteratorTransformer (23) - : +- ^ InputAdapter (22) - : +- ^ ShuffleQueryStage (21), Statistics(X) - : +- ColumnarExchange (20) - : +- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (56) + +- ^ SortExecTransformer (54) + +- ^ InputIteratorTransformer (53) + +- ShuffleQueryStage (51), Statistics(X) + +- ColumnarExchange (50) + +- VeloxAppendBatches (49) + +- ^ FilterExecTransformer (47) + +- ^ RegularHashAggregateExecTransformer (46) + +- ^ InputIteratorTransformer (45) + +- ShuffleQueryStage (43), Statistics(X) + +- ColumnarExchange (42) + +- VeloxAppendBatches (41) + +- ^ ProjectExecTransformer (39) + +- ^ FlushableHashAggregateExecTransformer (38) + +- ^ ProjectExecTransformer (37) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + :- ^ InputIteratorTransformer (26) + : +- ShuffleQueryStage (24), Statistics(X) + : +- ColumnarExchange (23) + : +- VeloxAppendBatches (22) + : +- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14), Statistics(X) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ NoopFilter (10) - : +- ^ Scan parquet (9) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ ProjectExecTransformer (26) - +- ^ NoopFilter (25) - +- ^ Scan parquet (24) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ NoopFilter (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ ProjectExecTransformer (29) + +- ^ NoopFilter (28) + +- ^ Scan parquet (27) +- == Initial Plan == - Sort (71) - +- Exchange (70) - +- Filter (69) - +- HashAggregate (68) - +- Exchange (67) - +- HashAggregate (66) - +- Project (65) - +- ShuffledHashJoin Inner BuildRight (64) - :- Exchange (59) - : +- Project (58) - : +- ShuffledHashJoin Inner BuildRight (57) - : :- Exchange (53) - : : +- Filter (52) - : : +- Scan parquet (51) - : +- Exchange (56) - : +- Filter (55) - : +- Scan parquet (54) - +- Exchange (63) - +- Project (62) - +- Filter (61) - +- Scan parquet (60) + Sort (81) + +- Exchange (80) + +- Filter (79) + +- HashAggregate (78) + +- Exchange 
(77) + +- HashAggregate (76) + +- Project (75) + +- SortMergeJoin Inner (74) + :- Sort (68) + : +- Exchange (67) + : +- Project (66) + : +- SortMergeJoin Inner (65) + : :- Sort (60) + : : +- Exchange (59) + : : +- Filter (58) + : : +- Scan parquet (57) + : +- Sort (64) + : +- Exchange (63) + : +- Filter (62) + : +- Scan parquet (61) + +- Sort (73) + +- Exchange (72) + +- Project (71) + +- Filter (70) + +- Scan parquet (69) (1) Scan parquet @@ -87,561 +91,627 @@ Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(9) Scan parquet +(10) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: [s_suppkey#X, s_nationkey#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: X + +(23) ColumnarExchange Input 
[5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [1]: [n_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [1]: [n_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, (ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))) AS _pre_X#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(34) FlushableHashAggregateExecTransformer +(38) FlushableHashAggregateExecTransformer Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(35) ProjectExecTransformer +(39) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(36) WholeStageCodegenTransformer (X) +(40) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: false -(37) ColumnarExchange +(41) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] +Arguments: X + +(42) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(38) ShuffleQueryStage +(43) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: X -(39) InputAdapter +(44) InputAdapter Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(40) InputIteratorTransformer +(45) InputIteratorTransformer Input [3]: [ps_partkey#X, sum#X, 
isEmpty#X] -(41) RegularHashAggregateExecTransformer +(46) RegularHashAggregateExecTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [2]: [ps_partkey#X, sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X AS value#X] -(42) FilterExecTransformer +(47) FilterExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(43) WholeStageCodegenTransformer (X) +(48) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(44) ColumnarExchange +(49) VeloxAppendBatches +Input [2]: [ps_partkey#X, value#X] +Arguments: X + +(50) ColumnarExchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(45) ShuffleQueryStage +(51) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] Arguments: X -(46) InputAdapter +(52) InputAdapter Input [2]: [ps_partkey#X, value#X] -(47) InputIteratorTransformer +(53) InputIteratorTransformer Input [2]: [ps_partkey#X, value#X] -(48) SortExecTransformer +(54) SortExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(50) VeloxColumnarToRowExec +(56) VeloxColumnarToRowExec Input [2]: [ps_partkey#X, value#X] -(51) Scan parquet +(57) Scan parquet Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(52) Filter +(58) Filter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(53) Exchange +(59) Exchange Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) Scan parquet +(60) Sort +Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(61) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(55) Filter +(62) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(56) Exchange +(63) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(57) ShuffledHashJoin +(64) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(65) SortMergeJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(58) Project +(66) Project Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(59) Exchange +(67) Exchange Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(60) Scan parquet +(68) Sort +Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: 
[s_nationkey#X ASC NULLS FIRST], false, 0 + +(69) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(61) Filter +(70) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(62) Project +(71) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(63) Exchange +(72) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) ShuffledHashJoin +(73) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(74) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(65) Project +(75) Project Output [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(66) HashAggregate +(76) HashAggregate Input [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(67) Exchange +(77) Exchange Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) HashAggregate +(78) HashAggregate Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [2]: [ps_partkey#X, sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X AS value#X] -(69) Filter +(79) Filter Input [2]: [ps_partkey#X, value#X] Condition : (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(70) Exchange +(80) Exchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Sort +(81) Sort Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(72) AdaptiveSparkPlan +(82) AdaptiveSparkPlan Output [2]: [ps_partkey#X, value#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 42 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (120) +Subquery:1 Hosting operator id = 47 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (136) +- == Final Plan == - VeloxColumnarToRowExec (102) - +- ^ ProjectExecTransformer (100) - +- ^ RegularHashAggregateExecTransformer (99) - +- ^ RegularHashAggregateExecTransformer (98) - +- ^ ProjectExecTransformer (97) - +- ^ ShuffledHashJoinExecTransformer Inner (96) - :- ^ InputIteratorTransformer (91) - : +- ^ InputAdapter (90) - : +- ^ ShuffleQueryStage (89), Statistics(X) - : +- ColumnarExchange (88) - : +- ^ ProjectExecTransformer (86) - : +- ^ ShuffledHashJoinExecTransformer Inner (85) - : :- ^ InputIteratorTransformer (80) - : : +- ^ InputAdapter (79) - : : +- ^ ShuffleQueryStage (78), Statistics(X) - : : +- ColumnarExchange (77) - : : +- ^ ProjectExecTransformer (75) - : : +- ^ NoopFilter (74) - : : +- ^ Scan parquet (73) - : +- ^ InputIteratorTransformer (84) - : +- ^ InputAdapter (83) - : +- ^ ShuffleQueryStage (82), Statistics(X) - : +- ReusedExchange (81) - +- ^ 
InputIteratorTransformer (95) - +- ^ InputAdapter (94) - +- ^ ShuffleQueryStage (93), Statistics(X) - +- ReusedExchange (92) + VeloxColumnarToRowExec (114) + +- ^ ProjectExecTransformer (112) + +- ^ RegularHashAggregateExecTransformer (111) + +- ^ RegularHashAggregateExecTransformer (110) + +- ^ ProjectExecTransformer (109) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (108) + :- ^ InputIteratorTransformer (103) + : +- ShuffleQueryStage (101), Statistics(X) + : +- ColumnarExchange (100) + : +- VeloxAppendBatches (99) + : +- ^ ProjectExecTransformer (97) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (96) + : :- ^ InputIteratorTransformer (91) + : : +- ShuffleQueryStage (89), Statistics(X) + : : +- ColumnarExchange (88) + : : +- VeloxAppendBatches (87) + : : +- ^ ProjectExecTransformer (85) + : : +- ^ NoopFilter (84) + : : +- ^ Scan parquet (83) + : +- ^ InputIteratorTransformer (95) + : +- ShuffleQueryStage (93), Statistics(X) + : +- ReusedExchange (92) + +- ^ InputIteratorTransformer (107) + +- ShuffleQueryStage (105), Statistics(X) + +- ReusedExchange (104) +- == Initial Plan == - HashAggregate (119) - +- HashAggregate (118) - +- Project (117) - +- ShuffledHashJoin Inner BuildRight (116) - :- Exchange (111) - : +- Project (110) - : +- ShuffledHashJoin Inner BuildRight (109) - : :- Exchange (105) - : : +- Filter (104) - : : +- Scan parquet (103) - : +- Exchange (108) - : +- Filter (107) - : +- Scan parquet (106) - +- Exchange (115) - +- Project (114) - +- Filter (113) - +- Scan parquet (112) - - -(73) Scan parquet + HashAggregate (135) + +- HashAggregate (134) + +- Project (133) + +- SortMergeJoin Inner (132) + :- Sort (126) + : +- Exchange (125) + : +- Project (124) + : +- SortMergeJoin Inner (123) + : :- Sort (118) + : : +- Exchange (117) + : : +- Filter (116) + : : +- Scan parquet (115) + : +- Sort (122) + : +- Exchange (121) + : +- Filter (120) + : +- Scan parquet (119) + +- Sort (131) + +- Exchange (130) + +- Project (129) + +- Filter (128) + +- Scan parquet (127) + + +(83) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(74) NoopFilter +(84) NoopFilter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(75) ProjectExecTransformer +(85) ProjectExecTransformer Output [4]: [hash(ps_suppkey#X, 42) AS hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(76) WholeStageCodegenTransformer (X) +(86) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: false -(77) ColumnarExchange +(87) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: X + +(88) ColumnarExchange Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] -(78) ShuffleQueryStage +(89) ShuffleQueryStage Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: X -(79) InputAdapter +(90) InputAdapter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(80) InputIteratorTransformer +(91) InputIteratorTransformer Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(81) ReusedExchange [Reuses operator id: 13] +(92) ReusedExchange [Reuses 
operator id: 15] Output [2]: [s_suppkey#X, s_nationkey#X] -(82) ShuffleQueryStage +(93) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(83) InputAdapter +(94) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(84) InputIteratorTransformer +(95) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(85) ShuffledHashJoinExecTransformer +(96) ShuffledHashJoinExecTransformer Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(86) ProjectExecTransformer +(97) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: X + +(100) ColumnarExchange Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(92) ReusedExchange [Reuses operator id: 28] +(104) ReusedExchange [Reuses operator id: 32] Output [1]: [n_nationkey#X] -(93) ShuffleQueryStage +(105) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(94) InputAdapter +(106) InputAdapter Input [1]: [n_nationkey#X] -(95) InputIteratorTransformer +(107) InputIteratorTransformer Input [1]: [n_nationkey#X] -(96) ShuffledHashJoinExecTransformer +(108) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(97) ProjectExecTransformer +(109) ProjectExecTransformer Output [3]: [ps_availqty#X, ps_supplycost#X, (ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))) AS _pre_X#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(98) RegularHashAggregateExecTransformer +(110) RegularHashAggregateExecTransformer Input [3]: [ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(99) RegularHashAggregateExecTransformer +(111) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] -(100) ProjectExecTransformer +(112) ProjectExecTransformer Output [1]: [(sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X * 0.0001000000) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Input [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] -(101) WholeStageCodegenTransformer (X) +(113) WholeStageCodegenTransformer (X) Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: 
false -(102) VeloxColumnarToRowExec +(114) VeloxColumnarToRowExec Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(103) Scan parquet +(115) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(104) Filter +(116) Filter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(105) Exchange +(117) Exchange Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) Scan parquet +(118) Sort +Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(119) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(107) Filter +(120) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(108) Exchange +(121) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(122) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(123) SortMergeJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(110) Project +(124) Project Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(111) Exchange +(125) Exchange Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) Scan parquet +(126) Sort +Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(127) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(113) Filter +(128) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(114) Project +(129) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(115) Exchange +(130) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(116) ShuffledHashJoin +(131) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(117) Project +(133) Project Output [2]: [ps_availqty#X, ps_supplycost#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(118) HashAggregate +(134) HashAggregate Input [2]: [ps_availqty#X, ps_supplycost#X] Keys: [] Functions [1]: [partial_sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(119) HashAggregate +(135) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [1]: [(sum((ps_supplycost#X * cast(ps_availqty#X as 
decimal(10,0))))#X * 0.0001000000) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(120) AdaptiveSparkPlan +(136) AdaptiveSparkPlan Output [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/12.txt index 595d52af52567..8f963c49aba83 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/12.txt @@ -1,50 +1,52 @@ == Physical Plan == -AdaptiveSparkPlan (49) +AdaptiveSparkPlan (55) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- ^ SortExecTransformer (32) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (48) - +- Exchange (47) - +- HashAggregate (46) - +- Exchange (45) - +- HashAggregate (44) - +- Project (43) - +- ShuffledHashJoin Inner BuildLeft (42) - :- Exchange (37) - : +- Filter (36) - : +- Scan parquet (35) - +- Exchange (41) - +- Project (40) - +- Filter (39) - +- Scan parquet (38) + Sort (54) + +- Exchange (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- Project (49) + +- SortMergeJoin Inner (48) + :- Sort (42) + : +- Exchange (41) + : +- Filter (40) + : +- Scan parquet (39) + +- Sort (47) + +- Exchange (46) + +- Project (45) + +- Filter (44) + +- Scan parquet (43) (1) Scan parquet @@ -66,198 +68,222 @@ Input [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, 
o_orderpriority#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [o_orderkey#X, o_orderpriority#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderpriority#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Arguments: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_shipmode#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [l_orderkey#X, l_shipmode#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [l_orderkey#X, l_shipmode#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [o_orderpriority#X, l_shipmode#X, CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END AS _pre_X#X, CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END AS _pre_X#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [o_orderpriority#X, l_shipmode#X, _pre_X#X, _pre_X#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(_pre_X#X), partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [4]: [hash(l_shipmode#X, 42) AS hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Input [3]: [l_shipmode#X, sum#X, sum#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] +Arguments: X + +(25) ColumnarExchange 
Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [3]: [l_shipmode#X, sum#X, sum#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(32) SortExecTransformer +(36) SortExecTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(33) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(34) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(35) Scan parquet +(39) Scan parquet Output [2]: [o_orderkey#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(36) Filter +(40) Filter Input [2]: [o_orderkey#X, o_orderpriority#X] Condition : isnotnull(o_orderkey#X) -(37) Exchange +(41) Exchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(38) Scan parquet +(42) Sort +Input [2]: [o_orderkey#X, o_orderpriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(43) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), 
LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(39) Filter +(44) Filter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Condition : ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(40) Project +(45) Project Output [2]: [l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(41) Exchange +(46) Exchange Input [2]: [l_orderkey#X, l_shipmode#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(42) ShuffledHashJoin +(47) Sort +Input [2]: [l_orderkey#X, l_shipmode#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(48) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(43) Project +(49) Project Output [2]: [o_orderpriority#X, l_shipmode#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(44) HashAggregate +(50) HashAggregate Input [2]: [o_orderpriority#X, l_shipmode#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), partial_sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(45) Exchange +(51) Exchange Input [3]: [l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(46) HashAggregate +(52) HashAggregate Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(47) Exchange +(53) Exchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(48) Sort +(54) Sort Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(49) AdaptiveSparkPlan +(55) AdaptiveSparkPlan Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/13.txt index 10a7818f6c4db..9584f92628cd8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/13.txt @@ -1,53 +1,55 @@ == Physical Plan == 
-AdaptiveSparkPlan (52) +AdaptiveSparkPlan (58) +- == Final Plan == - VeloxColumnarToRowExec (36) - +- ^ SortExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31), Statistics(X) - +- ColumnarExchange (30) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ ProjectExecTransformer (17) - +- ^ ShuffledHashJoinExecTransformer LeftOuter (16) - :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ ShuffleQueryStage (5), Statistics(X) - : +- ColumnarExchange (4) + VeloxColumnarToRowExec (40) + +- ^ SortExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35), Statistics(X) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ FlushableHashAggregateExecTransformer (23) + +- ^ ProjectExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ ProjectExecTransformer (19) + +- ^ ShuffledHashJoinExecTransformer LeftOuter BuildLeft (18) + :- ^ InputIteratorTransformer (8) + : +- ShuffleQueryStage (6), Statistics(X) + : +- ColumnarExchange (5) + : +- VeloxAppendBatches (4) : +- ^ ProjectExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (15) - +- ^ InputAdapter (14) - +- ^ ShuffleQueryStage (13), Statistics(X) - +- ColumnarExchange (12) - +- ^ ProjectExecTransformer (10) - +- ^ NoopFilter (9) - +- ^ Scan parquet (8) + +- ^ InputIteratorTransformer (17) + +- ShuffleQueryStage (15), Statistics(X) + +- ColumnarExchange (14) + +- VeloxAppendBatches (13) + +- ^ ProjectExecTransformer (11) + +- ^ NoopFilter (10) + +- ^ Scan parquet (9) +- == Initial Plan == - Sort (51) - +- Exchange (50) - +- HashAggregate (49) - +- Exchange (48) - +- HashAggregate (47) - +- HashAggregate (46) - +- HashAggregate (45) - +- Project (44) - +- ShuffledHashJoin LeftOuter BuildRight (43) - :- Exchange (38) - : +- Scan parquet (37) - +- Exchange (42) - +- Project (41) - +- Filter (40) - +- Scan parquet (39) + Sort (57) + +- Exchange (56) + +- HashAggregate (55) + +- Exchange (54) + +- HashAggregate (53) + +- HashAggregate (52) + +- HashAggregate (51) + +- Project (50) + +- SortMergeJoin LeftOuter (49) + :- Sort (43) + : +- Exchange (42) + : +- Scan parquet (41) + +- Sort (48) + +- Exchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -64,225 +66,249 @@ Input [1]: [c_custkey#X] Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: false -(4) ColumnarExchange +(4) VeloxAppendBatches +Input [2]: [hash_partition_key#X, c_custkey#X] +Arguments: X + +(5) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] -(5) ShuffleQueryStage +(6) ShuffleQueryStage Output [1]: [c_custkey#X] Arguments: X -(6) InputAdapter +(7) InputAdapter Input [1]: [c_custkey#X] -(7) InputIteratorTransformer +(8) InputIteratorTransformer Input [1]: 
[c_custkey#X] -(8) Scan parquet +(9) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(9) NoopFilter +(10) NoopFilter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Arguments: [o_orderkey#X, o_custkey#X, o_comment#X] -(10) ProjectExecTransformer +(11) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(12) ColumnarExchange +(13) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(14) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(13) ShuffleQueryStage +(15) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(14) InputAdapter +(16) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(15) InputIteratorTransformer +(17) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(16) ShuffledHashJoinExecTransformer +(18) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: LeftOuter Join condition: None -(17) ProjectExecTransformer +(19) ProjectExecTransformer Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(18) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [2]: [c_custkey#X, count(o_orderkey#X)#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [1]: [count(o_orderkey#X)#X AS c_count#X] Input [2]: [c_custkey#X, count(o_orderkey#X)#X] -(21) FlushableHashAggregateExecTransformer +(23) FlushableHashAggregateExecTransformer Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [3]: [hash(c_count#X, 42) AS hash_partition_key#X, c_count#X, count#X] Input [2]: [c_count#X, count#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_count#X, count#X] +Arguments: X + +(27) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [2]: [c_count#X, count#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [2]: [c_count#X, count#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input [2]: [c_count#X, count#X] -(28) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input 
[2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(30) ColumnarExchange +(33) VeloxAppendBatches +Input [2]: [c_count#X, custdist#X] +Arguments: X + +(34) ColumnarExchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(35) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] Arguments: X -(32) InputAdapter +(36) InputAdapter Input [2]: [c_count#X, custdist#X] -(33) InputIteratorTransformer +(37) InputIteratorTransformer Input [2]: [c_count#X, custdist#X] -(34) SortExecTransformer +(38) SortExecTransformer Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(36) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [2]: [c_count#X, custdist#X] -(37) Scan parquet +(41) Scan parquet Output [1]: [c_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Exchange +(42) Exchange Input [1]: [c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) Scan parquet +(43) Sort +Input [1]: [c_custkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(44) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(40) Filter +(45) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Condition : ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(41) Project +(46) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(42) Exchange +(47) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) ShuffledHashJoin +(48) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(49) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: LeftOuter Join condition: None -(44) Project +(50) Project Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(45) HashAggregate +(51) HashAggregate Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(46) HashAggregate +(52) HashAggregate Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [1]: [count(o_orderkey#X)#X AS c_count#X] -(47) HashAggregate +(53) HashAggregate Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(48) Exchange +(54) Exchange Input [2]: [c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) HashAggregate +(55) HashAggregate Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes 
[1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(50) Exchange +(56) Exchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Sort +(57) Sort Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(52) AdaptiveSparkPlan +(58) AdaptiveSparkPlan Output [2]: [c_count#X, custdist#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/14.txt index 4a4159446f40e..175a0e5a97cce 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/14.txt @@ -1,38 +1,40 @@ == Physical Plan == -AdaptiveSparkPlan (35) +AdaptiveSparkPlan (39) +- == Final Plan == - VeloxColumnarToRowExec (23) - +- ^ ProjectExecTransformer (21) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (25) + +- ^ ProjectExecTransformer (23) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - HashAggregate (34) - +- HashAggregate (33) - +- Project (32) - +- ShuffledHashJoin Inner BuildRight (31) - :- Exchange (27) - : +- Project (26) - : +- Filter (25) - : +- Scan parquet (24) - +- Exchange (30) - +- Filter (29) - +- Scan parquet (28) + HashAggregate (38) + +- HashAggregate (37) + +- Project (36) + +- SortMergeJoin Inner (35) + :- Sort (30) + : +- Exchange (29) + : +- Project (28) + : +- Filter (27) + : +- Scan parquet (26) + +- Sort (34) + +- Exchange (33) + +- Filter (32) + +- Scan parquet (31) (1) Scan parquet @@ -54,146 +56,162 @@ Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(7) 
InputAdapter +(8) InputAdapter Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] -(9) Scan parquet +(10) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [2]: [p_partkey#X, p_type#X] Arguments: [p_partkey#X, p_type#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_type#X] Input [2]: [p_partkey#X, p_type#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [p_partkey#X, p_type#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [p_partkey#X, p_type#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [p_partkey#X, p_type#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [l_extendedprice#X, l_discount#X, p_type#X, CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END AS _pre_X#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [5]: [l_extendedprice#X, l_discount#X, p_type#X, _pre_X#X, _pre_X#X] Keys: [] Functions [2]: [partial_sum(_pre_X#X), partial_sum(_pre_X#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(20) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END), sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(21) ProjectExecTransformer +(23) ProjectExecTransformer Output [1]: [((100.00 * sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X) / sum((l_extendedprice#X * (1 - l_discount#X)))#X) AS promo_revenue#X] Input [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(22) WholeStageCodegenTransformer (X) +(24) WholeStageCodegenTransformer (X) Input [1]: [promo_revenue#X] Arguments: false -(23) VeloxColumnarToRowExec +(25) VeloxColumnarToRowExec 
Input [1]: [promo_revenue#X] -(24) Scan parquet +(26) Scan parquet Output [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-09-01), LessThan(l_shipdate,1995-10-01), IsNotNull(l_partkey)] ReadSchema: struct -(25) Filter +(27) Filter Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-09-01)) AND (l_shipdate#X < 1995-10-01)) AND isnotnull(l_partkey#X)) -(26) Project +(28) Project Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(27) Exchange +(29) Exchange Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(28) Scan parquet +(30) Sort +Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(31) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(29) Filter +(32) Filter Input [2]: [p_partkey#X, p_type#X] Condition : isnotnull(p_partkey#X) -(30) Exchange +(33) Exchange Input [2]: [p_partkey#X, p_type#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(31) ShuffledHashJoin +(34) Sort +Input [2]: [p_partkey#X, p_type#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(35) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(32) Project +(36) Project Output [3]: [l_extendedprice#X, l_discount#X, p_type#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(33) HashAggregate +(37) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, p_type#X] Keys: [] Functions [2]: [partial_sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END), partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(34) HashAggregate +(38) HashAggregate Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END), sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [((100.00 * sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X) / sum((l_extendedprice#X * (1 - l_discount#X)))#X) AS promo_revenue#X] -(35) AdaptiveSparkPlan +(39) AdaptiveSparkPlan Output [1]: [promo_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/15.txt index 90558947bb35f..130d9036b4a25 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/15.txt @@ -1,45 +1,48 @@ == Physical Plan == -AdaptiveSparkPlan (42) +AdaptiveSparkPlan (47) +- == Final Plan == - VeloxColumnarToRowExec (27) - +- 
AQEShuffleRead (26) - +- ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ ShuffledHashJoinExecTransformer Inner (21) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) - : +- ^ ProjectExecTransformer (3) - : +- ^ NoopFilter (2) - : +- ^ Scan parquet (1) - +- ^ FilterExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) - +- ^ ProjectExecTransformer (13) - +- ^ FlushableHashAggregateExecTransformer (12) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + VeloxColumnarToRowExec (30) + +- AQEShuffleRead (29) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (23) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) + : +- ^ ProjectExecTransformer (3) + : +- ^ NoopFilter (2) + : +- ^ Scan parquet (1) + +- ^ FilterExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ InputIteratorTransformer (20) + +- ShuffleQueryStage (18), Statistics(X) + +- ColumnarExchange (17) + +- VeloxAppendBatches (16) + +- ^ ProjectExecTransformer (14) + +- ^ FlushableHashAggregateExecTransformer (13) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (41) - +- Exchange (40) - +- Project (39) - +- ShuffledHashJoin Inner BuildLeft (38) - :- Exchange (30) - : +- Filter (29) - : +- Scan parquet (28) - +- Filter (37) - +- HashAggregate (36) - +- Exchange (35) - +- HashAggregate (34) - +- Project (33) - +- Filter (32) - +- Scan parquet (31) + Sort (46) + +- Exchange (45) + +- Project (44) + +- SortMergeJoin Inner (43) + :- Sort (34) + : +- Exchange (33) + : +- Filter (32) + : +- Scan parquet (31) + +- Sort (42) + +- Filter (41) + +- HashAggregate (40) + +- Exchange (39) + +- HashAggregate (38) + +- Project (37) + +- Filter (36) + +- Scan parquet (35) (1) Scan parquet @@ -61,331 +64,355 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: 
[l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(12) FlushableHashAggregateExecTransformer +(13) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(13) ProjectExecTransformer +(14) ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(14) WholeStageCodegenTransformer (X) +(15) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(16) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(17) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(18) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(19) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(18) InputIteratorTransformer +(20) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] -(20) FilterExecTransformer +(22) FilterExecTransformer Input [2]: [supplier_no#X, total_revenue#X] Arguments: (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(21) ShuffledHashJoinExecTransformer +(23) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join type: Inner Join condition: None -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] +Arguments: X + +(27) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: X -(26) AQEShuffleRead +(29) AQEShuffleRead Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: local -(27) VeloxColumnarToRowExec +(30) VeloxColumnarToRowExec Input [5]: [s_suppkey#X, s_name#X, 
s_address#X, s_phone#X, total_revenue#X] -(28) Scan parquet +(31) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey)] ReadSchema: struct -(29) Filter +(32) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Condition : isnotnull(s_suppkey#X) -(30) Exchange +(33) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(31) Scan parquet +(34) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(35) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(32) Filter +(36) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(33) Project +(37) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(34) HashAggregate +(38) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(35) Exchange +(39) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(36) HashAggregate +(40) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] -(37) Filter +(41) Filter Input [2]: [supplier_no#X, total_revenue#X] Condition : (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(38) ShuffledHashJoin +(42) Sort +Input [2]: [supplier_no#X, total_revenue#X] +Arguments: [supplier_no#X ASC NULLS FIRST], false, 0 + +(43) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join type: Inner Join condition: None -(39) Project +(44) Project Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(40) Exchange +(45) Exchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Sort +(46) Sort Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(42) AdaptiveSparkPlan +(47) AdaptiveSparkPlan Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 20 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (67) +Subquery:1 Hosting operator id = 22 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (73) +- == Final Plan == - 
VeloxColumnarToRowExec (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ RegularHashAggregateExecTransformer (55) - +- ^ ProjectExecTransformer (54) - +- ^ RegularHashAggregateExecTransformer (53) - +- ^ InputIteratorTransformer (52) - +- ^ InputAdapter (51) - +- ^ ShuffleQueryStage (50), Statistics(X) - +- ColumnarExchange (49) - +- ^ ProjectExecTransformer (47) - +- ^ FlushableHashAggregateExecTransformer (46) - +- ^ ProjectExecTransformer (45) - +- ^ NoopFilter (44) - +- ^ Scan parquet (43) + VeloxColumnarToRowExec (64) + +- ^ RegularHashAggregateExecTransformer (62) + +- ^ RegularHashAggregateExecTransformer (61) + +- ^ ProjectExecTransformer (60) + +- ^ RegularHashAggregateExecTransformer (59) + +- ^ InputIteratorTransformer (58) + +- ShuffleQueryStage (56), Statistics(X) + +- ColumnarExchange (55) + +- VeloxAppendBatches (54) + +- ^ ProjectExecTransformer (52) + +- ^ FlushableHashAggregateExecTransformer (51) + +- ^ ProjectExecTransformer (50) + +- ^ NoopFilter (49) + +- ^ Scan parquet (48) +- == Initial Plan == - HashAggregate (66) - +- HashAggregate (65) - +- HashAggregate (64) - +- Exchange (63) - +- HashAggregate (62) - +- Project (61) - +- Filter (60) - +- Scan parquet (59) + HashAggregate (72) + +- HashAggregate (71) + +- HashAggregate (70) + +- Exchange (69) + +- HashAggregate (68) + +- Project (67) + +- Filter (66) + +- Scan parquet (65) -(43) Scan parquet +(48) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(44) NoopFilter +(49) NoopFilter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(45) ProjectExecTransformer +(50) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(46) FlushableHashAggregateExecTransformer +(51) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(47) ProjectExecTransformer +(52) ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(48) WholeStageCodegenTransformer (X) +(53) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(49) ColumnarExchange +(54) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(55) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(50) ShuffleQueryStage +(56) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(51) InputAdapter +(57) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(52) InputIteratorTransformer +(58) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(53) RegularHashAggregateExecTransformer +(59) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: 
[sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [l_suppkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(54) ProjectExecTransformer +(60) ProjectExecTransformer Output [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] Input [2]: [l_suppkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(55) RegularHashAggregateExecTransformer +(61) RegularHashAggregateExecTransformer Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(56) RegularHashAggregateExecTransformer +(62) RegularHashAggregateExecTransformer Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(57) WholeStageCodegenTransformer (X) +(63) WholeStageCodegenTransformer (X) Input [1]: [max(total_revenue)#X] Arguments: false -(58) VeloxColumnarToRowExec +(64) VeloxColumnarToRowExec Input [1]: [max(total_revenue)#X] -(59) Scan parquet +(65) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(60) Filter +(66) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) -(61) Project +(67) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(62) HashAggregate +(68) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(63) Exchange +(69) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) HashAggregate +(70) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] -(65) HashAggregate +(71) HashAggregate Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(66) HashAggregate +(72) HashAggregate Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(67) AdaptiveSparkPlan +(73) AdaptiveSparkPlan Output [1]: [max(total_revenue)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/16.txt index 98c7da22a549e..32a24beb94b95 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/16.txt @@ -1,64 +1,66 @@ == Physical Plan == -AdaptiveSparkPlan (64) +AdaptiveSparkPlan (71) +- == Final Plan == - 
VeloxColumnarToRowExec (42) - +- ^ SortExecTransformer (40) - +- ^ InputIteratorTransformer (39) - +- ^ InputAdapter (38) - +- ^ ShuffleQueryStage (37), Statistics(X) - +- ColumnarExchange (36) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31), Statistics(X) - +- ColumnarExchange (30) - +- ^ ProjectExecTransformer (28) - +- ^ FlushableHashAggregateExecTransformer (27) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (47) + +- ^ SortExecTransformer (45) + +- ^ InputIteratorTransformer (44) + +- ShuffleQueryStage (42), Statistics(X) + +- ColumnarExchange (41) + +- VeloxAppendBatches (40) + +- ^ RegularHashAggregateExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35), Statistics(X) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ ProjectExecTransformer (31) + +- ^ FlushableHashAggregateExecTransformer (30) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (63) - +- Exchange (62) - +- HashAggregate (61) - +- Exchange (60) - +- HashAggregate (59) - +- HashAggregate (58) - +- Exchange (57) - +- HashAggregate (56) - +- Project (55) - +- ShuffledHashJoin Inner BuildRight (54) - :- Exchange (50) - : +- BroadcastHashJoin LeftAnti BuildRight (49) - : :- Filter (44) - : : +- Scan parquet (43) - : +- BroadcastExchange (48) - : +- Project (47) - : +- Filter (46) - : +- Scan parquet (45) - +- Exchange (53) - +- Filter (52) - +- Scan parquet (51) + Sort (70) + +- Exchange (69) + +- HashAggregate (68) + +- Exchange (67) + +- HashAggregate (66) + +- HashAggregate (65) + +- Exchange (64) + +- HashAggregate (63) + +- Project (62) + +- SortMergeJoin Inner (61) + :- Sort (56) + : +- Exchange (55) + : +- BroadcastHashJoin LeftAnti BuildRight (54) + : :- Filter (49) + : : +- Scan parquet (48) + : +- BroadcastExchange (53) + : +- Project (52) + : +- Filter (51) + : +- Scan parquet (50) + +- Sort (60) + +- Exchange (59) + +- Filter (58) + +- Scan parquet (57) (1) Scan parquet @@ -80,273 +82,301 @@ Input [2]: [ps_partkey#X, ps_suppkey#X] Input [3]: 
[hash_partition_key#X, ps_partkey#X, ps_suppkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [ps_partkey#X, ps_suppkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [ps_partkey#X, ps_suppkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [ps_partkey#X, ps_suppkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] +Arguments: X + +(25) ColumnarExchange Input [5]: 
[hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(27) FlushableHashAggregateExecTransformer +(30) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(28) ProjectExecTransformer +(31) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: false -(30) ColumnarExchange +(33) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] +Arguments: X + +(34) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(35) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: X -(32) InputAdapter +(36) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(33) InputIteratorTransformer +(37) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(34) RegularHashAggregateExecTransformer +(38) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] +Arguments: X + +(41) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] 
-(40) SortExecTransformer +(45) SortExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(41) WholeStageCodegenTransformer (X) +(46) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(42) VeloxColumnarToRowExec +(47) VeloxColumnarToRowExec Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(43) Scan parquet +(48) Scan parquet Output [2]: [ps_partkey#X, ps_suppkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_partkey)] ReadSchema: struct -(44) Filter +(49) Filter Input [2]: [ps_partkey#X, ps_suppkey#X] Condition : isnotnull(ps_partkey#X) -(45) Scan parquet +(50) Scan parquet Output [2]: [s_suppkey#X, s_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_comment)] ReadSchema: struct -(46) Filter +(51) Filter Input [2]: [s_suppkey#X, s_comment#X] Condition : (isnotnull(s_comment#X) AND s_comment#X LIKE %Customer%Complaints%) -(47) Project +(52) Project Output [1]: [s_suppkey#X] Input [2]: [s_suppkey#X, s_comment#X] -(48) BroadcastExchange +(53) BroadcastExchange Input [1]: [s_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),true), [plan_id=X] -(49) BroadcastHashJoin +(54) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: LeftAnti Join condition: None -(50) Exchange +(55) Exchange Input [2]: [ps_partkey#X, ps_suppkey#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Scan parquet +(56) Sort +Input [2]: [ps_partkey#X, ps_suppkey#X] +Arguments: [ps_partkey#X ASC NULLS FIRST], false, 0 + +(57) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(52) Filter +(58) Filter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Condition : (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(53) Exchange +(59) Exchange Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) ShuffledHashJoin +(60) Sort +Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(61) SortMergeJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(55) Project +(62) Project Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(56) HashAggregate +(63) HashAggregate Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(57) Exchange +(64) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(58) HashAggregate +(65) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, 
ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(59) HashAggregate +(66) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(60) Exchange +(67) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(61) HashAggregate +(68) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(62) Exchange +(69) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(63) Sort +(70) Sort Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(64) AdaptiveSparkPlan +(71) AdaptiveSparkPlan Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/17.txt index 97c56c7dae8da..59baa2d7a08ca 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/17.txt @@ -1,59 +1,62 @@ == Physical Plan == -AdaptiveSparkPlan (57) +AdaptiveSparkPlan (63) +- == Final Plan == - VeloxColumnarToRowExec (37) - +- ^ ProjectExecTransformer (35) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ RegularHashAggregateExecTransformer (33) - +- ^ ProjectExecTransformer (32) - +- ^ ShuffledHashJoinExecTransformer Inner (31) - :- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (40) + +- ^ ProjectExecTransformer (38) + +- ^ RegularHashAggregateExecTransformer (37) + +- ^ RegularHashAggregateExecTransformer (36) + +- ^ ProjectExecTransformer (35) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (34) + :- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14), Statistics(X) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ NoopFilter (10) - : +- ^ Scan parquet (9) - +- ^ FilterExecTransformer (30) - +- ^ ProjectExecTransformer (29) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- 
^ ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ NoopFilter (20) - +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ NoopFilter (11) + : +- ^ Scan parquet (10) + +- ^ FilterExecTransformer (33) + +- ^ ProjectExecTransformer (32) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ FlushableHashAggregateExecTransformer (23) + +- ^ NoopFilter (22) + +- ^ Scan parquet (21) +- == Initial Plan == - HashAggregate (56) - +- HashAggregate (55) - +- Project (54) - +- ShuffledHashJoin Inner BuildRight (53) - :- Project (46) - : +- ShuffledHashJoin Inner BuildRight (45) - : :- Exchange (40) - : : +- Filter (39) - : : +- Scan parquet (38) - : +- Exchange (44) - : +- Project (43) - : +- Filter (42) - : +- Scan parquet (41) - +- Filter (52) - +- HashAggregate (51) - +- Exchange (50) - +- HashAggregate (49) - +- Filter (48) - +- Scan parquet (47) + HashAggregate (62) + +- HashAggregate (61) + +- Project (60) + +- SortMergeJoin Inner (59) + :- Project (51) + : +- SortMergeJoin Inner (50) + : :- Sort (44) + : : +- Exchange (43) + : : +- Filter (42) + : : +- Scan parquet (41) + : +- Sort (49) + : +- Exchange (48) + : +- Project (47) + : +- Filter (46) + : +- Scan parquet (45) + +- Sort (58) + +- Filter (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- Filter (53) + +- Scan parquet (52) (1) Scan parquet @@ -75,254 +78,278 @@ Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Arguments: [p_partkey#X, p_brand#X, p_container#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [2]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(15) ColumnarExchange 
Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [1]: [p_partkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [1]: [p_partkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(19) Scan parquet +(21) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(20) NoopFilter +(22) NoopFilter Input [2]: [l_partkey#X, l_quantity#X] Arguments: [l_partkey#X, l_quantity#X] -(21) FlushableHashAggregateExecTransformer +(23) FlushableHashAggregateExecTransformer Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [4]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_partkey#X, sum#X, count#X] Input [3]: [l_partkey#X, sum#X, count#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] +Arguments: X + +(27) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [3]: [l_partkey#X, sum#X, count#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [3]: [l_partkey#X, sum#X, count#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input [3]: [l_partkey#X, sum#X, count#X] -(28) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [l_partkey#X, avg(l_quantity#X)#X] -(29) ProjectExecTransformer +(32) ProjectExecTransformer Output [2]: [(0.2 * avg(l_quantity#X)#X) AS (0.2 * avg(l_quantity))#X, l_partkey#X] Input [2]: [l_partkey#X, avg(l_quantity#X)#X] -(30) FilterExecTransformer +(33) FilterExecTransformer Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Arguments: isnotnull((0.2 * avg(l_quantity))#X) -(31) ShuffledHashJoinExecTransformer +(34) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(32) ProjectExecTransformer +(35) ProjectExecTransformer Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(33) RegularHashAggregateExecTransformer +(36) RegularHashAggregateExecTransformer Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results 
[2]: [sum#X, isEmpty#X] -(34) RegularHashAggregateExecTransformer +(37) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [sum(l_extendedprice#X)#X] -(35) ProjectExecTransformer +(38) ProjectExecTransformer Output [1]: [(sum(l_extendedprice#X)#X / 7.0) AS avg_yearly#X] Input [1]: [sum(l_extendedprice#X)#X] -(36) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [1]: [avg_yearly#X] Arguments: false -(37) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [1]: [avg_yearly#X] -(38) Scan parquet +(41) Scan parquet Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_quantity)] ReadSchema: struct -(39) Filter +(42) Filter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Condition : (isnotnull(l_partkey#X) AND isnotnull(l_quantity#X)) -(40) Exchange +(43) Exchange Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Scan parquet +(44) Sort +Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(45) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(42) Filter +(46) Filter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Condition : ((((isnotnull(p_brand#X) AND isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(43) Project +(47) Project Output [1]: [p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(44) Exchange +(48) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(45) ShuffledHashJoin +(49) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(50) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(46) Project +(51) Project Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(47) Scan parquet +(52) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(48) Filter +(53) Filter Input [2]: [l_partkey#X, l_quantity#X] Condition : isnotnull(l_partkey#X) -(49) HashAggregate +(54) HashAggregate Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(50) Exchange +(55) Exchange Input [3]: [l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) HashAggregate +(56) HashAggregate Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [(0.2 * avg(l_quantity#X)#X) AS (0.2 * avg(l_quantity))#X, l_partkey#X] -(52) Filter +(57) Filter Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Condition : isnotnull((0.2 * avg(l_quantity))#X) -(53) ShuffledHashJoin +(58) Sort 
+Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(59) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(54) Project +(60) Project Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(55) HashAggregate +(61) HashAggregate Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(56) HashAggregate +(62) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [(sum(l_extendedprice#X)#X / 7.0) AS avg_yearly#X] -(57) AdaptiveSparkPlan +(63) AdaptiveSparkPlan Output [1]: [avg_yearly#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/18.txt index f0537d1fcf079..c78e265e54d95 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/18.txt @@ -1,96 +1,102 @@ == Physical Plan == -AdaptiveSparkPlan (97) +AdaptiveSparkPlan (110) +- == Final Plan == - VeloxColumnarToRowExec (64) - +- TakeOrderedAndProjectExecTransformer (63) - +- ^ RegularHashAggregateExecTransformer (61) - +- ^ RegularHashAggregateExecTransformer (60) - +- ^ ProjectExecTransformer (59) - +- ^ ShuffledHashJoinExecTransformer Inner (58) - :- ^ InputIteratorTransformer (41) - : +- ^ InputAdapter (40) - : +- ^ ShuffleQueryStage (39), Statistics(X) - : +- ColumnarExchange (38) - : +- ^ ProjectExecTransformer (36) - : +- ^ ShuffledHashJoinExecTransformer Inner (35) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (70) + +- TakeOrderedAndProjectExecTransformer (69) + +- ^ RegularHashAggregateExecTransformer (67) + +- ^ RegularHashAggregateExecTransformer (66) + +- ^ ProjectExecTransformer (65) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (64) + :- ^ InputIteratorTransformer (46) + : +- ShuffleQueryStage (44), Statistics(X) + : +- ColumnarExchange (43) + : +- VeloxAppendBatches (42) + : +- ^ ProjectExecTransformer (40) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (39) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (34) - : +- ^ InputAdapter (33) - : +- ^ ShuffleQueryStage (32), Statistics(X) - : +- ColumnarExchange (31) - : +- ^ ProjectExecTransformer (29) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (28) - : :- ^ InputIteratorTransformer (16) - : : +- ^ InputAdapter (15) - : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : +- ColumnarExchange (13) - : : +- ^ ProjectExecTransformer (11) - : : +- ^ NoopFilter (10) - : : +- ^ Scan parquet (9) - : +- ^ ProjectExecTransformer (27) - : +- ^ FilterExecTransformer (26) - : +- ^ RegularHashAggregateExecTransformer (25) - : +- ^ InputIteratorTransformer (24) - : +- ^ InputAdapter (23) - : +- ^ 
ShuffleQueryStage (22), Statistics(X) - : +- ColumnarExchange (21) - : +- ^ ProjectExecTransformer (19) - : +- ^ FlushableHashAggregateExecTransformer (18) - : +- ^ Scan parquet (17) - +- ^ ShuffledHashJoinExecTransformer LeftSemi (57) - :- ^ InputIteratorTransformer (49) - : +- ^ InputAdapter (48) - : +- ^ ShuffleQueryStage (47), Statistics(X) - : +- ColumnarExchange (46) - : +- ^ ProjectExecTransformer (44) - : +- ^ NoopFilter (43) - : +- ^ Scan parquet (42) - +- ^ ProjectExecTransformer (56) - +- ^ FilterExecTransformer (55) - +- ^ RegularHashAggregateExecTransformer (54) - +- ^ InputIteratorTransformer (53) - +- ^ InputAdapter (52) - +- ^ ShuffleQueryStage (51), Statistics(X) - +- ReusedExchange (50) + : +- ^ InputIteratorTransformer (38) + : +- ShuffleQueryStage (36), Statistics(X) + : +- ColumnarExchange (35) + : +- VeloxAppendBatches (34) + : +- ^ ProjectExecTransformer (32) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (31) + : :- ^ InputIteratorTransformer (18) + : : +- ShuffleQueryStage (16), Statistics(X) + : : +- ColumnarExchange (15) + : : +- VeloxAppendBatches (14) + : : +- ^ ProjectExecTransformer (12) + : : +- ^ NoopFilter (11) + : : +- ^ Scan parquet (10) + : +- ^ ProjectExecTransformer (30) + : +- ^ FilterExecTransformer (29) + : +- ^ RegularHashAggregateExecTransformer (28) + : +- ^ InputIteratorTransformer (27) + : +- ShuffleQueryStage (25), Statistics(X) + : +- ColumnarExchange (24) + : +- VeloxAppendBatches (23) + : +- ^ ProjectExecTransformer (21) + : +- ^ FlushableHashAggregateExecTransformer (20) + : +- ^ Scan parquet (19) + +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (63) + :- ^ InputIteratorTransformer (55) + : +- ShuffleQueryStage (53), Statistics(X) + : +- ColumnarExchange (52) + : +- VeloxAppendBatches (51) + : +- ^ ProjectExecTransformer (49) + : +- ^ NoopFilter (48) + : +- ^ Scan parquet (47) + +- ^ ProjectExecTransformer (62) + +- ^ FilterExecTransformer (61) + +- ^ RegularHashAggregateExecTransformer (60) + +- ^ InputIteratorTransformer (59) + +- ShuffleQueryStage (57), Statistics(X) + +- ReusedExchange (56) +- == Initial Plan == - TakeOrderedAndProject (96) - +- HashAggregate (95) - +- HashAggregate (94) - +- Project (93) - +- ShuffledHashJoin Inner BuildRight (92) - :- Exchange (81) - : +- Project (80) - : +- ShuffledHashJoin Inner BuildLeft (79) - : :- Exchange (67) - : : +- Filter (66) - : : +- Scan parquet (65) - : +- Exchange (78) - : +- ShuffledHashJoin LeftSemi BuildRight (77) - : :- Exchange (70) - : : +- Filter (69) - : : +- Scan parquet (68) - : +- Project (76) - : +- Filter (75) - : +- HashAggregate (74) - : +- Exchange (73) - : +- HashAggregate (72) - : +- Scan parquet (71) - +- ShuffledHashJoin LeftSemi BuildRight (91) - :- Exchange (84) - : +- Filter (83) - : +- Scan parquet (82) - +- Project (90) - +- Filter (89) - +- HashAggregate (88) - +- Exchange (87) - +- HashAggregate (86) - +- Scan parquet (85) + TakeOrderedAndProject (109) + +- HashAggregate (108) + +- HashAggregate (107) + +- Project (106) + +- SortMergeJoin Inner (105) + :- Sort (92) + : +- Exchange (91) + : +- Project (90) + : +- SortMergeJoin Inner (89) + : :- Sort (74) + : : +- Exchange (73) + : : +- Filter (72) + : : +- Scan parquet (71) + : +- Sort (88) + : +- Exchange (87) + : +- SortMergeJoin LeftSemi (86) + : :- Sort (78) + : : +- Exchange (77) + : : +- Filter (76) + : : +- Scan parquet (75) + : +- Sort (85) + : +- Project (84) + : +- Filter (83) + : +- HashAggregate (82) + : +- Exchange (81) + : +- HashAggregate (80) + : +- Scan parquet (79) + 
+- SortMergeJoin LeftSemi (104) + :- Sort (96) + : +- Exchange (95) + : +- Filter (94) + : +- Scan parquet (93) + +- Sort (103) + +- Project (102) + +- Filter (101) + +- HashAggregate (100) + +- Exchange (99) + +- HashAggregate (98) + +- Scan parquet (97) (1) Scan parquet @@ -112,428 +118,480 @@ Input [2]: [c_custkey#X, c_name#X] Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [c_custkey#X, c_name#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [c_custkey#X, c_name#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [c_custkey#X, c_name#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(18) FlushableHashAggregateExecTransformer +(20) FlushableHashAggregateExecTransformer Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] +Arguments: X + +(24) ColumnarExchange 
Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(25) RegularHashAggregateExecTransformer +(28) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(26) FilterExecTransformer +(29) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(27) ProjectExecTransformer +(30) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(28) ShuffledHashJoinExecTransformer +(31) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(29) ProjectExecTransformer +(32) ProjectExecTransformer Output [5]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(30) WholeStageCodegenTransformer (X) +(33) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(31) ColumnarExchange +(34) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(35) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(32) ShuffleQueryStage +(36) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(33) InputAdapter +(37) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(34) InputIteratorTransformer +(38) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(35) ShuffledHashJoinExecTransformer +(39) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(36) ProjectExecTransformer +(40) ProjectExecTransformer Output [6]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(37) WholeStageCodegenTransformer (X) +(41) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(38) ColumnarExchange +(42) VeloxAppendBatches +Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(43) ColumnarExchange Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, 
o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(39) ShuffleQueryStage +(44) ShuffleQueryStage Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(40) InputAdapter +(45) InputAdapter Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -(41) InputIteratorTransformer +(46) InputIteratorTransformer Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -(42) Scan parquet +(47) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(43) NoopFilter +(48) NoopFilter Input [2]: [l_orderkey#X, l_quantity#X] Arguments: [l_orderkey#X, l_quantity#X] -(44) ProjectExecTransformer +(49) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_quantity#X] Input [2]: [l_orderkey#X, l_quantity#X] -(45) WholeStageCodegenTransformer (X) +(50) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] Arguments: false -(46) ColumnarExchange +(51) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] +Arguments: X + +(52) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [id=#X] -(47) ShuffleQueryStage +(53) ShuffleQueryStage Output [2]: [l_orderkey#X, l_quantity#X] Arguments: X -(48) InputAdapter +(54) InputAdapter Input [2]: [l_orderkey#X, l_quantity#X] -(49) InputIteratorTransformer +(55) InputIteratorTransformer Input [2]: [l_orderkey#X, l_quantity#X] -(50) ReusedExchange [Reuses operator id: 21] +(56) ReusedExchange [Reuses operator id: 24] Output [3]: [l_orderkey#X, sum#X, isEmpty#X] -(51) ShuffleQueryStage +(57) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(52) InputAdapter +(58) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(53) InputIteratorTransformer +(59) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(54) RegularHashAggregateExecTransformer +(60) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(55) FilterExecTransformer +(61) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(56) ProjectExecTransformer +(62) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(57) ShuffledHashJoinExecTransformer +(63) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(58) ShuffledHashJoinExecTransformer +(64) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(59) ProjectExecTransformer +(65) ProjectExecTransformer Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(60) RegularHashAggregateExecTransformer +(66) RegularHashAggregateExecTransformer Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] 
Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(61) RegularHashAggregateExecTransformer +(67) RegularHashAggregateExecTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(62) WholeStageCodegenTransformer (X) +(68) WholeStageCodegenTransformer (X) Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: false -(63) TakeOrderedAndProjectExecTransformer +(69) TakeOrderedAndProjectExecTransformer Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X], 0 -(64) VeloxColumnarToRowExec +(70) VeloxColumnarToRowExec Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(65) Scan parquet +(71) Scan parquet Output [2]: [c_custkey#X, c_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey)] ReadSchema: struct -(66) Filter +(72) Filter Input [2]: [c_custkey#X, c_name#X] Condition : isnotnull(c_custkey#X) -(67) Exchange +(73) Exchange Input [2]: [c_custkey#X, c_name#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) Scan parquet +(74) Sort +Input [2]: [c_custkey#X, c_name#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(75) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(69) Filter +(76) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Condition : (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(70) Exchange +(77) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(78) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(79) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(72) HashAggregate +(80) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(73) Exchange +(81) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) HashAggregate +(82) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(75) Filter +(83) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : 
(isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(76) Project +(84) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(77) ShuffledHashJoin +(85) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(86) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(78) Exchange +(87) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(79) ShuffledHashJoin +(88) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(89) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(80) Project +(90) Project Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(81) Exchange +(91) Exchange Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(82) Scan parquet +(92) Sort +Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(93) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(83) Filter +(94) Filter Input [2]: [l_orderkey#X, l_quantity#X] Condition : isnotnull(l_orderkey#X) -(84) Exchange +(95) Exchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) Scan parquet +(96) Sort +Input [2]: [l_orderkey#X, l_quantity#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(97) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(86) HashAggregate +(98) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(87) Exchange +(99) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(88) HashAggregate +(100) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(89) Filter +(101) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(90) Project +(102) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(91) ShuffledHashJoin +(103) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(104) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(92) ShuffledHashJoin +(105) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(93) Project +(106) Project Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, 
o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(94) HashAggregate +(107) HashAggregate Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(95) HashAggregate +(108) HashAggregate Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(96) TakeOrderedAndProject +(109) TakeOrderedAndProject Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(97) AdaptiveSparkPlan +(110) AdaptiveSparkPlan Output [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/19.txt index 7d03f6eca9635..a9c629524fb71 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/19.txt @@ -1,37 +1,39 @@ == Physical Plan == -AdaptiveSparkPlan (34) +AdaptiveSparkPlan (38) +- == Final Plan == - VeloxColumnarToRowExec (22) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (24) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - HashAggregate (33) - +- HashAggregate (32) - +- Project (31) - +- ShuffledHashJoin Inner BuildRight (30) - :- Exchange (26) - : +- Project (25) - : +- Filter (24) - : +- Scan parquet (23) - +- Exchange (29) - +- Filter (28) - +- Scan parquet (27) + HashAggregate (37) + +- HashAggregate (36) + +- Project (35) + +- SortMergeJoin Inner (34) + :- Sort (29) + : +- Exchange (28) + : +- Project (27) + : +- Filter (26) + : 
+- Scan parquet (25) + +- Sort (33) + +- Exchange (32) + +- Filter (31) + +- Scan parquet (30) (1) Scan parquet @@ -53,142 +55,158 @@ Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipin Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND 
p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [l_extendedprice#X, l_discount#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_extendedprice#X, l_discount#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(20) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(22) VeloxColumnarToRowExec +(24) VeloxColumnarToRowExec Input [1]: [revenue#X] -(23) Scan parquet +(25) Scan parquet Output [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipinstruct), In(l_shipmode, [AIR,AIR REG]), EqualTo(l_shipinstruct,DELIVER IN PERSON), IsNotNull(l_partkey), Or(Or(And(GreaterThanOrEqual(l_quantity,1.00),LessThanOrEqual(l_quantity,11.00)),And(GreaterThanOrEqual(l_quantity,10.00),LessThanOrEqual(l_quantity,20.00))),And(GreaterThanOrEqual(l_quantity,20.00),LessThanOrEqual(l_quantity,30.00)))] ReadSchema: struct -(24) Filter +(26) Filter Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Condition : ((((isnotnull(l_shipinstruct#X) AND l_shipmode#X IN (AIR,AIR REG)) AND (l_shipinstruct#X = DELIVER IN PERSON)) AND isnotnull(l_partkey#X)) AND ((((l_quantity#X >= 1.00) AND (l_quantity#X <= 11.00)) OR ((l_quantity#X >= 10.00) AND (l_quantity#X <= 20.00))) OR ((l_quantity#X >= 20.00) AND (l_quantity#X <= 30.00)))) -(25) Project +(27) Project Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] -(26) Exchange +(28) Exchange Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Scan parquet +(29) Sort +Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(30) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(28) Filter +(31) Filter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Condition : (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND 
(((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(29) Exchange +(32) Exchange Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(30) ShuffledHashJoin +(33) Sort +Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(34) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(31) Project +(35) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(32) HashAggregate +(36) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(33) HashAggregate +(37) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(34) AdaptiveSparkPlan +(38) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/20.txt index 4e67d57112e78..8e929ff7b2961 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/20.txt @@ -1,119 +1,129 @@ == Physical Plan == -AdaptiveSparkPlan (123) +AdaptiveSparkPlan (143) +- == Final Plan == - VeloxColumnarToRowExec (83) - +- AQEShuffleRead (82) - +- ShuffleQueryStage (81), Statistics(X) - +- ColumnarExchange (80) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (62) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) - : : +- ^ ProjectExecTransformer (3) - : : +- ^ NoopFilter (2) - : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ ShuffledHashJoinExecTransformer Inner (55) - : :- ^ InputIteratorTransformer (31) - : : +- ^ InputAdapter (30) - : 
: +- ^ ShuffleQueryStage (29), Statistics(X) - : : +- ColumnarExchange (28) - : : +- ^ ProjectExecTransformer (26) - : : +- ^ ShuffledHashJoinExecTransformer LeftSemi (25) - : : :- ^ InputIteratorTransformer (16) - : : : +- ^ InputAdapter (15) - : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : +- ColumnarExchange (13) - : : : +- ^ ProjectExecTransformer (11) - : : : +- ^ NoopFilter (10) - : : : +- ^ Scan parquet (9) - : : +- ^ InputIteratorTransformer (24) - : : +- ^ InputAdapter (23) - : : +- ^ ShuffleQueryStage (22), Statistics(X) - : : +- ColumnarExchange (21) - : : +- ^ ProjectExecTransformer (19) - : : +- ^ NoopFilter (18) - : : +- ^ Scan parquet (17) - : +- ^ InputIteratorTransformer (54) - : +- ^ InputAdapter (53) - : +- ^ ShuffleQueryStage (52), Statistics(X) - : +- ColumnarExchange (51) - : +- ^ ProjectExecTransformer (49) - : +- ^ FilterExecTransformer (48) - : +- ^ ProjectExecTransformer (47) - : +- ^ RegularHashAggregateExecTransformer (46) - : +- ^ RegularHashAggregateExecTransformer (45) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (44) - : :- ^ InputIteratorTransformer (39) - : : +- ^ InputAdapter (38) - : : +- ^ ShuffleQueryStage (37), Statistics(X) - : : +- ColumnarExchange (36) - : : +- ^ ProjectExecTransformer (34) - : : +- ^ NoopFilter (33) - : : +- ^ Scan parquet (32) - : +- ^ InputIteratorTransformer (43) - : +- ^ InputAdapter (42) - : +- ^ ShuffleQueryStage (41), Statistics(X) - : +- ReusedExchange (40) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74), Statistics(X) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ NoopFilter (70) - +- ^ Scan parquet (69) + VeloxColumnarToRowExec (93) + +- AQEShuffleRead (92) + +- ShuffleQueryStage (91), Statistics(X) + +- ColumnarExchange (90) + +- VeloxAppendBatches (89) + +- ^ ProjectExecTransformer (87) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (86) + :- ^ InputIteratorTransformer (76) + : +- ShuffleQueryStage (74), Statistics(X) + : +- ColumnarExchange (73) + : +- VeloxAppendBatches (72) + : +- ^ ProjectExecTransformer (70) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (69) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) + : : +- ^ ProjectExecTransformer (3) + : : +- ^ NoopFilter (2) + : : +- ^ Scan parquet (1) + : +- ^ InputIteratorTransformer (68) + : +- ShuffleQueryStage (66), Statistics(X) + : +- ColumnarExchange (65) + : +- VeloxAppendBatches (64) + : +- ^ ProjectExecTransformer (62) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (61) + : :- ^ InputIteratorTransformer (35) + : : +- ShuffleQueryStage (33), Statistics(X) + : : +- ColumnarExchange (32) + : : +- VeloxAppendBatches (31) + : : +- ^ ProjectExecTransformer (29) + : : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (28) + : : :- ^ InputIteratorTransformer (18) + : : : +- ShuffleQueryStage (16), Statistics(X) + : : : +- ColumnarExchange (15) + : : : +- VeloxAppendBatches (14) + : : : +- ^ ProjectExecTransformer (12) + : : : +- ^ NoopFilter (11) + : : : +- ^ Scan parquet (10) + : : +- ^ InputIteratorTransformer (27) + : : +- ShuffleQueryStage (25), Statistics(X) + : : +- ColumnarExchange (24) + : : +- VeloxAppendBatches (23) + : : +- ^ ProjectExecTransformer (21) + : : +- ^ NoopFilter (20) + : : +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (60) + : +- ShuffleQueryStage (58), Statistics(X) + : +- ColumnarExchange (57) + : +- 
VeloxAppendBatches (56) + : +- ^ ProjectExecTransformer (54) + : +- ^ FilterExecTransformer (53) + : +- ^ ProjectExecTransformer (52) + : +- ^ RegularHashAggregateExecTransformer (51) + : +- ^ RegularHashAggregateExecTransformer (50) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (49) + : :- ^ InputIteratorTransformer (44) + : : +- ShuffleQueryStage (42), Statistics(X) + : : +- ColumnarExchange (41) + : : +- VeloxAppendBatches (40) + : : +- ^ ProjectExecTransformer (38) + : : +- ^ NoopFilter (37) + : : +- ^ Scan parquet (36) + : +- ^ InputIteratorTransformer (48) + : +- ShuffleQueryStage (46), Statistics(X) + : +- ReusedExchange (45) + +- ^ InputIteratorTransformer (85) + +- ShuffleQueryStage (83), Statistics(X) + +- ColumnarExchange (82) + +- VeloxAppendBatches (81) + +- ^ ProjectExecTransformer (79) + +- ^ NoopFilter (78) + +- ^ Scan parquet (77) +- == Initial Plan == - Sort (122) - +- Exchange (121) - +- Project (120) - +- ShuffledHashJoin Inner BuildRight (119) - :- Exchange (114) - : +- Project (113) - : +- ShuffledHashJoin LeftSemi BuildRight (112) - : :- Exchange (86) - : : +- Filter (85) - : : +- Scan parquet (84) - : +- Exchange (111) - : +- Project (110) - : +- ShuffledHashJoin Inner BuildLeft (109) - : :- Exchange (95) - : : +- ShuffledHashJoin LeftSemi BuildRight (94) - : : :- Exchange (89) - : : : +- Filter (88) - : : : +- Scan parquet (87) - : : +- Exchange (93) - : : +- Project (92) - : : +- Filter (91) - : : +- Scan parquet (90) - : +- Exchange (108) - : +- Filter (107) - : +- HashAggregate (106) - : +- HashAggregate (105) - : +- ShuffledHashJoin LeftSemi BuildRight (104) - : :- Exchange (99) - : : +- Project (98) - : : +- Filter (97) - : : +- Scan parquet (96) - : +- Exchange (103) - : +- Project (102) - : +- Filter (101) - : +- Scan parquet (100) - +- Exchange (118) - +- Project (117) - +- Filter (116) - +- Scan parquet (115) + Sort (142) + +- Exchange (141) + +- Project (140) + +- SortMergeJoin Inner (139) + :- Sort (133) + : +- Exchange (132) + : +- Project (131) + : +- SortMergeJoin LeftSemi (130) + : :- Sort (97) + : : +- Exchange (96) + : : +- Filter (95) + : : +- Scan parquet (94) + : +- Sort (129) + : +- Exchange (128) + : +- Project (127) + : +- SortMergeJoin Inner (126) + : :- Sort (109) + : : +- Exchange (108) + : : +- SortMergeJoin LeftSemi (107) + : : :- Sort (101) + : : : +- Exchange (100) + : : : +- Filter (99) + : : : +- Scan parquet (98) + : : +- Sort (106) + : : +- Exchange (105) + : : +- Project (104) + : : +- Filter (103) + : : +- Scan parquet (102) + : +- Sort (125) + : +- Exchange (124) + : +- Filter (123) + : +- HashAggregate (122) + : +- HashAggregate (121) + : +- SortMergeJoin LeftSemi (120) + : :- Sort (114) + : : +- Exchange (113) + : : +- Project (112) + : : +- Filter (111) + : : +- Scan parquet (110) + : +- Sort (119) + : +- Exchange (118) + : +- Project (117) + : +- Filter (116) + : +- Scan parquet (115) + +- Sort (138) + +- Exchange (137) + +- Project (136) + +- Filter (135) + +- Scan parquet (134) (1) Scan parquet @@ -135,518 +145,598 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, 
s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(18) NoopFilter +(20) NoopFilter Input [2]: [p_partkey#X, p_name#X] Arguments: [p_partkey#X, p_name#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [2]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(24) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [1]: [p_partkey#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [1]: [p_partkey#X] -(25) ShuffledHashJoinExecTransformer +(28) ShuffledHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join type: LeftSemi Join condition: None -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, ps_suppkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, 
ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(32) Scan parquet +(36) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(33) NoopFilter +(37) NoopFilter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Arguments: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(34) ProjectExecTransformer +(38) ProjectExecTransformer Output [4]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] +Arguments: X + +(41) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] -(40) ReusedExchange [Reuses operator id: 21] +(45) ReusedExchange [Reuses operator id: 24] Output [1]: [p_partkey#X] -(41) ShuffleQueryStage +(46) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(42) InputAdapter +(47) InputAdapter Input [1]: [p_partkey#X] -(43) InputIteratorTransformer +(48) InputIteratorTransformer Input [1]: [p_partkey#X] -(44) ShuffledHashJoinExecTransformer +(49) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: LeftSemi Join condition: None -(45) RegularHashAggregateExecTransformer +(50) RegularHashAggregateExecTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(46) RegularHashAggregateExecTransformer +(51) RegularHashAggregateExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(47) ProjectExecTransformer +(52) 
ProjectExecTransformer Output [3]: [(0.5 * sum(l_quantity#X)#X) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(48) FilterExecTransformer +(53) FilterExecTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: isnotnull((0.5 * sum(l_quantity))#X) -(49) ProjectExecTransformer +(54) ProjectExecTransformer Output [4]: [hash(l_partkey#X, l_suppkey#X, 42) AS hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(50) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: false -(51) ColumnarExchange +(56) VeloxAppendBatches +Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] +Arguments: X + +(57) ColumnarExchange Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(52) ShuffleQueryStage +(58) ShuffleQueryStage Output [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: X -(53) InputAdapter +(59) InputAdapter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(54) InputIteratorTransformer +(60) InputIteratorTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(55) ShuffledHashJoinExecTransformer +(61) ShuffledHashJoinExecTransformer Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join type: Inner Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(56) ProjectExecTransformer +(62) ProjectExecTransformer Output [2]: [hash(ps_suppkey#X, 42) AS hash_partition_key#X, ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(57) WholeStageCodegenTransformer (X) +(63) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, ps_suppkey#X] Arguments: false -(58) ColumnarExchange +(64) VeloxAppendBatches +Input [2]: [hash_partition_key#X, ps_suppkey#X] +Arguments: X + +(65) ColumnarExchange Input [2]: [hash_partition_key#X, ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(66) ShuffleQueryStage Output [1]: [ps_suppkey#X] Arguments: X -(60) InputAdapter +(67) InputAdapter Input [1]: [ps_suppkey#X] -(61) InputIteratorTransformer +(68) InputIteratorTransformer Input [1]: [ps_suppkey#X] -(62) ShuffledHashJoinExecTransformer +(69) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join type: LeftSemi Join condition: None -(63) ProjectExecTransformer +(70) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(64) WholeStageCodegenTransformer (X) +(71) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(65) ColumnarExchange +(72) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(73) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Arguments: 
hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(74) ShuffleQueryStage Output [3]: [s_name#X, s_address#X, s_nationkey#X] Arguments: X -(67) InputAdapter +(75) InputAdapter Input [3]: [s_name#X, s_address#X, s_nationkey#X] -(68) InputIteratorTransformer +(76) InputIteratorTransformer Input [3]: [s_name#X, s_address#X, s_nationkey#X] -(69) Scan parquet +(77) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(70) NoopFilter +(78) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(71) ProjectExecTransformer +(79) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(72) WholeStageCodegenTransformer (X) +(80) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(73) ColumnarExchange +(81) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(82) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(83) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(75) InputAdapter +(84) InputAdapter Input [1]: [n_nationkey#X] -(76) InputIteratorTransformer +(85) InputIteratorTransformer Input [1]: [n_nationkey#X] -(77) ShuffledHashJoinExecTransformer +(86) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(78) ProjectExecTransformer +(87) ProjectExecTransformer Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(79) WholeStageCodegenTransformer (X) +(88) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(80) ColumnarExchange +(89) VeloxAppendBatches +Input [2]: [s_name#X, s_address#X] +Arguments: X + +(90) ColumnarExchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(81) ShuffleQueryStage +(91) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] Arguments: X -(82) AQEShuffleRead +(92) AQEShuffleRead Input [2]: [s_name#X, s_address#X] Arguments: local -(83) VeloxColumnarToRowExec +(93) VeloxColumnarToRowExec Input [2]: [s_name#X, s_address#X] -(84) Scan parquet +(94) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_nationkey)] ReadSchema: struct -(85) Filter +(95) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Condition : isnotnull(s_nationkey#X) -(86) Exchange +(96) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(87) Scan parquet +(97) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(98) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(88) Filter +(99) Filter Input [3]: [ps_partkey#X, 
ps_suppkey#X, ps_availqty#X] Condition : ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(89) Exchange +(100) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) Scan parquet +(101) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: [ps_partkey#X ASC NULLS FIRST], false, 0 + +(102) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(91) Filter +(103) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(92) Project +(104) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(93) Exchange +(105) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(94) ShuffledHashJoin +(106) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(107) SortMergeJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join type: LeftSemi Join condition: None -(95) Exchange +(108) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(96) Scan parquet +(109) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: [ps_partkey#X ASC NULLS FIRST, ps_suppkey#X ASC NULLS FIRST], false, 0 + +(110) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(97) Filter +(111) Filter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(98) Project +(112) Project Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(99) Exchange +(113) Exchange Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(100) Scan parquet +(114) Sort +Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(115) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(101) Filter +(116) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(102) Project +(117) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(103) Exchange +(118) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(104) ShuffledHashJoin +(119) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(120) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: LeftSemi Join condition: None -(105) HashAggregate +(121) HashAggregate Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes 
[2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(106) HashAggregate +(122) HashAggregate Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [(0.5 * sum(l_quantity#X)#X) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(107) Filter +(123) Filter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Condition : isnotnull((0.5 * sum(l_quantity))#X) -(108) Exchange +(124) Exchange Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(125) Sort +Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] +Arguments: [l_partkey#X ASC NULLS FIRST, l_suppkey#X ASC NULLS FIRST], false, 0 + +(126) SortMergeJoin Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join type: Inner Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(110) Project +(127) Project Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(111) Exchange +(128) Exchange Input [1]: [ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) ShuffledHashJoin +(129) Sort +Input [1]: [ps_suppkey#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(130) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join type: LeftSemi Join condition: None -(113) Project +(131) Project Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(114) Exchange +(132) Exchange Input [3]: [s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(115) Scan parquet +(133) Sort +Input [3]: [s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(134) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(116) Filter +(135) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(117) Project +(136) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(118) Exchange +(137) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(119) ShuffledHashJoin +(138) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(139) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(120) Project +(140) Project Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(121) Exchange +(141) Exchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(122) Sort +(142) Sort Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(123) AdaptiveSparkPlan +(143) AdaptiveSparkPlan Output [2]: [s_name#X, s_address#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/21.txt index 658a499a30cec..279f4f0966927 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/21.txt @@ -1,113 +1,122 @@ == Physical Plan == -AdaptiveSparkPlan (118) +AdaptiveSparkPlan (137) +- == Final Plan == - VeloxColumnarToRowExec (81) - +- ^ RegularHashAggregateExecTransformer (79) - +- ^ InputIteratorTransformer (78) - +- ^ InputAdapter (77) - +- ^ ShuffleQueryStage (76), Statistics(X) - +- ColumnarExchange (75) - +- ^ ProjectExecTransformer (73) - +- ^ FlushableHashAggregateExecTransformer (72) - +- ^ ProjectExecTransformer (71) - +- ^ ShuffledHashJoinExecTransformer Inner (70) - :- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ ShuffledHashJoinExecTransformer Inner (55) - : :- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ ShuffledHashJoinExecTransformer Inner (40) - : : :- ^ InputIteratorTransformer (8) - : : : +- ^ InputAdapter (7) - : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (91) + +- ^ RegularHashAggregateExecTransformer (89) + +- ^ InputIteratorTransformer (88) + +- ShuffleQueryStage (86), Statistics(X) + +- ColumnarExchange (85) + +- VeloxAppendBatches (84) + +- ^ ProjectExecTransformer (82) + +- ^ FlushableHashAggregateExecTransformer (81) + +- ^ ProjectExecTransformer (80) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (79) + :- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (62) + : :- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (45) + : : :- ^ InputIteratorTransformer (9) + : : : +- ShuffleQueryStage (7), Statistics(X) + : : : +- ColumnarExchange (6) + : : : +- VeloxAppendBatches (5) : : : +- ^ ProjectExecTransformer (3) : : : +- ^ NoopFilter (2) : : : +- ^ Scan parquet (1) - : : +- ^ InputIteratorTransformer (39) - : : +- ^ InputAdapter (38) - : : +- ^ ShuffleQueryStage (37), Statistics(X) - : : +- ColumnarExchange (36) - : : +- ^ ProjectExecTransformer (34) - : : +- ^ ShuffledHashJoinExecTransformer LeftAnti (33) - : : :- ^ ShuffledHashJoinExecTransformer LeftSemi (24) - : : : :- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ NoopFilter (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (23) - : : : +- ^ InputAdapter (22) - : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : +- ColumnarExchange (20) - : : : +- ^ ProjectExecTransformer (18) - : : : +- ^ Scan parquet (17) - : : +- ^ InputIteratorTransformer (32) - : : +- ^ InputAdapter (31) - : : +- ^ ShuffleQueryStage (30), Statistics(X) - : : +- 
ColumnarExchange (29) - : : +- ^ ProjectExecTransformer (27) - : : +- ^ NoopFilter (26) - : : +- ^ Scan parquet (25) - : +- ^ InputIteratorTransformer (54) - : +- ^ InputAdapter (53) - : +- ^ ShuffleQueryStage (52), Statistics(X) - : +- ColumnarExchange (51) - : +- ^ ProjectExecTransformer (49) - : +- ^ NoopFilter (48) - : +- ^ Scan parquet (47) - +- ^ InputIteratorTransformer (69) - +- ^ InputAdapter (68) - +- ^ ShuffleQueryStage (67), Statistics(X) - +- ColumnarExchange (66) - +- ^ ProjectExecTransformer (64) - +- ^ NoopFilter (63) - +- ^ Scan parquet (62) + : : +- ^ InputIteratorTransformer (44) + : : +- ShuffleQueryStage (42), Statistics(X) + : : +- ColumnarExchange (41) + : : +- VeloxAppendBatches (40) + : : +- ^ ProjectExecTransformer (38) + : : +- ^ ShuffledHashJoinExecTransformer LeftAnti BuildRight (37) + : : :- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (27) + : : : :- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ NoopFilter (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (26) + : : : +- ShuffleQueryStage (24), Statistics(X) + : : : +- ColumnarExchange (23) + : : : +- VeloxAppendBatches (22) + : : : +- ^ ProjectExecTransformer (20) + : : : +- ^ Scan parquet (19) + : : +- ^ InputIteratorTransformer (36) + : : +- ShuffleQueryStage (34), Statistics(X) + : : +- ColumnarExchange (33) + : : +- VeloxAppendBatches (32) + : : +- ^ ProjectExecTransformer (30) + : : +- ^ NoopFilter (29) + : : +- ^ Scan parquet (28) + : +- ^ InputIteratorTransformer (61) + : +- ShuffleQueryStage (59), Statistics(X) + : +- ColumnarExchange (58) + : +- VeloxAppendBatches (57) + : +- ^ ProjectExecTransformer (55) + : +- ^ NoopFilter (54) + : +- ^ Scan parquet (53) + +- ^ InputIteratorTransformer (78) + +- ShuffleQueryStage (76), Statistics(X) + +- ColumnarExchange (75) + +- VeloxAppendBatches (74) + +- ^ ProjectExecTransformer (72) + +- ^ NoopFilter (71) + +- ^ Scan parquet (70) +- == Initial Plan == - TakeOrderedAndProject (117) - +- HashAggregate (116) - +- Exchange (115) - +- HashAggregate (114) - +- Project (113) - +- ShuffledHashJoin Inner BuildRight (112) - :- Exchange (107) - : +- Project (106) - : +- ShuffledHashJoin Inner BuildRight (105) - : :- Exchange (100) - : : +- Project (99) - : : +- ShuffledHashJoin Inner BuildLeft (98) - : : :- Exchange (84) - : : : +- Filter (83) - : : : +- Scan parquet (82) - : : +- Exchange (97) - : : +- ShuffledHashJoin LeftAnti BuildRight (96) - : : :- ShuffledHashJoin LeftSemi BuildRight (91) - : : : :- Exchange (88) - : : : : +- Project (87) - : : : : +- Filter (86) - : : : : +- Scan parquet (85) - : : : +- Exchange (90) - : : : +- Scan parquet (89) - : : +- Exchange (95) - : : +- Project (94) - : : +- Filter (93) - : : +- Scan parquet (92) - : +- Exchange (104) - : +- Project (103) - : +- Filter (102) - : +- Scan parquet (101) - +- Exchange (111) - +- Project (110) - +- Filter (109) - +- Scan parquet (108) + TakeOrderedAndProject (136) + +- HashAggregate (135) + +- Exchange (134) + +- HashAggregate (133) + +- Project (132) + +- SortMergeJoin Inner (131) + :- Sort (125) + : +- Exchange (124) + : +- Project (123) + : +- SortMergeJoin Inner (122) + : :- Sort (116) + : : +- Exchange (115) + : : +- Project (114) + : : +- SortMergeJoin Inner (113) + : : :- Sort (95) + : : : +- Exchange (94) + : : : +- Filter (93) + : : : +- Scan parquet (92) + : : +- Sort (112) + : : +- 
Exchange (111) + : : +- SortMergeJoin LeftAnti (110) + : : :- SortMergeJoin LeftSemi (104) + : : : :- Sort (100) + : : : : +- Exchange (99) + : : : : +- Project (98) + : : : : +- Filter (97) + : : : : +- Scan parquet (96) + : : : +- Sort (103) + : : : +- Exchange (102) + : : : +- Scan parquet (101) + : : +- Sort (109) + : : +- Exchange (108) + : : +- Project (107) + : : +- Filter (106) + : : +- Scan parquet (105) + : +- Sort (121) + : +- Exchange (120) + : +- Project (119) + : +- Filter (118) + : +- Scan parquet (117) + +- Sort (130) + +- Exchange (129) + +- Project (128) + +- Filter (127) + +- Scan parquet (126) (1) Scan parquet @@ -129,500 +138,576 @@ Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Arguments: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [2]: [l_orderkey#X, l_suppkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(23) ColumnarExchange Input [3]: 
[hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(24) ShuffledHashJoinExecTransformer +(27) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: NOT (l_suppkey#X = l_suppkey#X) -(25) Scan parquet +(28) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(26) NoopFilter +(29) NoopFilter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Arguments: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(27) ProjectExecTransformer +(30) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(28) WholeStageCodegenTransformer (X) +(31) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(29) ColumnarExchange +(32) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(33) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(30) ShuffleQueryStage +(34) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(31) InputAdapter +(35) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(32) InputIteratorTransformer +(36) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(33) ShuffledHashJoinExecTransformer +(37) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftAnti Join condition: NOT (l_suppkey#X = l_suppkey#X) -(34) ProjectExecTransformer +(38) ProjectExecTransformer Output [3]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [2]: [l_orderkey#X, l_suppkey#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(41) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(40) ShuffledHashJoinExecTransformer +(45) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join type: Inner Join condition: None -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: 
[s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] -(47) Scan parquet +(53) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(48) NoopFilter +(54) NoopFilter Input [2]: [o_orderkey#X, o_orderstatus#X] Arguments: [o_orderkey#X, o_orderstatus#X] -(49) ProjectExecTransformer +(55) ProjectExecTransformer Output [2]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(50) WholeStageCodegenTransformer (X) +(56) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, o_orderkey#X] Arguments: false -(51) ColumnarExchange +(57) VeloxAppendBatches +Input [2]: [hash_partition_key#X, o_orderkey#X] +Arguments: X + +(58) ColumnarExchange Input [2]: [hash_partition_key#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [id=#X] -(52) ShuffleQueryStage +(59) ShuffleQueryStage Output [1]: [o_orderkey#X] Arguments: X -(53) InputAdapter +(60) InputAdapter Input [1]: [o_orderkey#X] -(54) InputIteratorTransformer +(61) InputIteratorTransformer Input [1]: [o_orderkey#X] -(55) ShuffledHashJoinExecTransformer +(62) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [s_name#X, s_nationkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [s_name#X, s_nationkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [s_name#X, s_nationkey#X] -(62) Scan parquet +(70) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(63) NoopFilter +(71) NoopFilter Input [2]: [n_nationkey#X, n_name#X] 
Arguments: [n_nationkey#X, n_name#X] -(64) ProjectExecTransformer +(72) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(65) WholeStageCodegenTransformer (X) +(73) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(66) ColumnarExchange +(74) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(75) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(67) ShuffleQueryStage +(76) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(68) InputAdapter +(77) InputAdapter Input [1]: [n_nationkey#X] -(69) InputIteratorTransformer +(78) InputIteratorTransformer Input [1]: [n_nationkey#X] -(70) ShuffledHashJoinExecTransformer +(79) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(72) FlushableHashAggregateExecTransformer +(81) FlushableHashAggregateExecTransformer Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(73) ProjectExecTransformer +(82) ProjectExecTransformer Output [3]: [hash(s_name#X, 42) AS hash_partition_key#X, s_name#X, count#X] Input [2]: [s_name#X, count#X] -(74) WholeStageCodegenTransformer (X) +(83) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: false -(75) ColumnarExchange +(84) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, count#X] +Arguments: X + +(85) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] -(76) ShuffleQueryStage +(86) ShuffleQueryStage Output [2]: [s_name#X, count#X] Arguments: X -(77) InputAdapter +(87) InputAdapter Input [2]: [s_name#X, count#X] -(78) InputIteratorTransformer +(88) InputIteratorTransformer Input [2]: [s_name#X, count#X] -(79) RegularHashAggregateExecTransformer +(89) RegularHashAggregateExecTransformer Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(80) WholeStageCodegenTransformer (X) +(90) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, numwait#X] Arguments: false -(81) VeloxColumnarToRowExec +(91) VeloxColumnarToRowExec Input [2]: [s_name#X, numwait#X] -(82) Scan parquet +(92) Scan parquet Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(83) Filter +(93) Filter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(84) Exchange +(94) Exchange Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) Scan parquet +(95) Sort +Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(96) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex 
[*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(86) Filter +(97) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(87) Project +(98) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(88) Exchange +(99) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(89) Scan parquet +(100) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(101) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(90) Exchange +(102) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(91) ShuffledHashJoin +(103) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(104) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: NOT (l_suppkey#X = l_suppkey#X) -(92) Scan parquet +(105) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(93) Filter +(106) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(94) Project +(107) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(95) Exchange +(108) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(96) ShuffledHashJoin +(109) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(110) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftAnti Join condition: NOT (l_suppkey#X = l_suppkey#X) -(97) Exchange +(111) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(98) ShuffledHashJoin +(112) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(113) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join type: Inner Join condition: None -(99) Project +(114) Project Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(100) Exchange +(115) Exchange Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(101) Scan parquet +(116) Sort +Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(117) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(102) Filter +(118) Filter Input [2]: [o_orderkey#X, o_orderstatus#X] Condition : 
((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(103) Project +(119) Project Output [1]: [o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(104) Exchange +(120) Exchange Input [1]: [o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) ShuffledHashJoin +(121) Sort +Input [1]: [o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(122) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(106) Project +(123) Project Output [2]: [s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(107) Exchange +(124) Exchange Input [2]: [s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) Scan parquet +(125) Sort +Input [2]: [s_name#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(126) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(109) Filter +(127) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(110) Project +(128) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(111) Exchange +(129) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) ShuffledHashJoin +(130) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(131) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(113) Project +(132) Project Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(114) HashAggregate +(133) HashAggregate Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(115) Exchange +(134) Exchange Input [2]: [s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(116) HashAggregate +(135) HashAggregate Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(117) TakeOrderedAndProject +(136) TakeOrderedAndProject Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X] -(118) AdaptiveSparkPlan +(137) AdaptiveSparkPlan Output [2]: [s_name#X, numwait#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/22.txt index e94b7b01715f8..2b93055014bd0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/22.txt @@ -1,47 +1,49 @@ == Physical Plan == -AdaptiveSparkPlan (46) +AdaptiveSparkPlan (52) +- == Final Plan == - VeloxColumnarToRowExec (33) - +- ^ SortExecTransformer (31) - +- ^ InputIteratorTransformer (30) - +- ^ InputAdapter (29) - +- ^ ShuffleQueryStage (28), Statistics(X) - +- ColumnarExchange (27) - +- ^ RegularHashAggregateExecTransformer (25) - +- ^ InputIteratorTransformer (24) - +- ^ InputAdapter 
(23) - +- ^ ShuffleQueryStage (22), Statistics(X) - +- ColumnarExchange (21) - +- ^ ProjectExecTransformer (19) - +- ^ FlushableHashAggregateExecTransformer (18) - +- ^ ProjectExecTransformer (17) - +- ^ ShuffledHashJoinExecTransformer LeftAnti (16) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (37) + +- ^ SortExecTransformer (35) + +- ^ InputIteratorTransformer (34) + +- ShuffleQueryStage (32), Statistics(X) + +- ColumnarExchange (31) + +- VeloxAppendBatches (30) + +- ^ RegularHashAggregateExecTransformer (28) + +- ^ InputIteratorTransformer (27) + +- ShuffleQueryStage (25), Statistics(X) + +- ColumnarExchange (24) + +- VeloxAppendBatches (23) + +- ^ ProjectExecTransformer (21) + +- ^ FlushableHashAggregateExecTransformer (20) + +- ^ ProjectExecTransformer (19) + +- ^ ShuffledHashJoinExecTransformer LeftAnti BuildRight (18) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (15) - +- ^ InputAdapter (14) - +- ^ ShuffleQueryStage (13), Statistics(X) - +- ColumnarExchange (12) - +- ^ ProjectExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (17) + +- ShuffleQueryStage (15), Statistics(X) + +- ColumnarExchange (14) + +- VeloxAppendBatches (13) + +- ^ ProjectExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (45) - +- Exchange (44) - +- HashAggregate (43) - +- Exchange (42) - +- HashAggregate (41) - +- Project (40) - +- ShuffledHashJoin LeftAnti BuildRight (39) - :- Exchange (36) - : +- Filter (35) - : +- Scan parquet (34) - +- Exchange (38) - +- Scan parquet (37) + Sort (51) + +- Exchange (50) + +- HashAggregate (49) + +- Exchange (48) + +- HashAggregate (47) + +- Project (46) + +- SortMergeJoin LeftAnti (45) + :- Sort (41) + : +- Exchange (40) + : +- Filter (39) + : +- Scan parquet (38) + +- Sort (44) + +- Exchange (43) + +- Scan parquet (42) (1) Scan parquet @@ -63,299 +65,327 @@ Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(9) Scan parquet +(10) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(10) ProjectExecTransformer +(11) ProjectExecTransformer Output [2]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_custkey#X] Input [1]: [o_custkey#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, o_custkey#X] Arguments: false -(12) ColumnarExchange +(13) VeloxAppendBatches +Input [2]: [hash_partition_key#X, o_custkey#X] +Arguments: X + +(14) ColumnarExchange Input [2]: 
[hash_partition_key#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [id=#X] -(13) ShuffleQueryStage +(15) ShuffleQueryStage Output [1]: [o_custkey#X] Arguments: X -(14) InputAdapter +(16) InputAdapter Input [1]: [o_custkey#X] -(15) InputIteratorTransformer +(17) InputIteratorTransformer Input [1]: [o_custkey#X] -(16) ShuffledHashJoinExecTransformer +(18) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: LeftAnti Join condition: None -(17) ProjectExecTransformer +(19) ProjectExecTransformer Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(18) FlushableHashAggregateExecTransformer +(20) FlushableHashAggregateExecTransformer Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [5]: [hash(cntrycode#X, 42) AS hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] +Arguments: X + +(24) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(25) RegularHashAggregateExecTransformer +(28) RegularHashAggregateExecTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(26) WholeStageCodegenTransformer (X) +(29) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(27) ColumnarExchange +(30) VeloxAppendBatches +Input [3]: [cntrycode#X, numcust#X, totacctbal#X] +Arguments: X + +(31) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(28) ShuffleQueryStage +(32) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: X -(29) InputAdapter +(33) InputAdapter Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(30) InputIteratorTransformer +(34) InputIteratorTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(31) SortExecTransformer +(35) SortExecTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(32) WholeStageCodegenTransformer (X) +(36) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(33) VeloxColumnarToRowExec +(37) 
VeloxColumnarToRowExec Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(34) Scan parquet +(38) Scan parquet Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal)] ReadSchema: struct -(35) Filter +(39) Filter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) AND (cast(c_acctbal#X as decimal(16,6)) > Subquery subquery#X, [id=#X])) -(36) Exchange +(40) Exchange Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(37) Scan parquet +(41) Sort +Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(42) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Exchange +(43) Exchange Input [1]: [o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) ShuffledHashJoin +(44) Sort +Input [1]: [o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(45) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: LeftAnti Join condition: None -(40) Project +(46) Project Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(41) HashAggregate +(47) HashAggregate Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(42) Exchange +(48) Exchange Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) HashAggregate +(49) HashAggregate Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(44) Exchange +(50) Exchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(45) Sort +(51) Sort Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(46) AdaptiveSparkPlan +(52) AdaptiveSparkPlan Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: isFinalPlan=true ===== Subqueries ===== Subquery:1 Hosting operator id = 1 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (65) +AdaptiveSparkPlan (72) +- == Final Plan == - VeloxColumnarToRowExec (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) - +- ^ FlushableHashAggregateExecTransformer (50) - +- ^ ProjectExecTransformer (49) - +- ^ NoopFilter (48) - +- ^ Scan parquet (47) + VeloxColumnarToRowExec (65) + +- ^ RegularHashAggregateExecTransformer (63) + +- ^ InputIteratorTransformer (62) + +- ShuffleQueryStage (60), Statistics(X) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ FlushableHashAggregateExecTransformer (56) + +- ^ ProjectExecTransformer (55) + +- ^ NoopFilter (54) + +- ^ Scan parquet (53) +- == Initial Plan == - HashAggregate (64) - +- Exchange (63) - +- HashAggregate (62) - +- Project (61) 
- +- Filter (60) - +- Scan parquet (59) + HashAggregate (71) + +- Exchange (70) + +- HashAggregate (69) + +- Project (68) + +- Filter (67) + +- Scan parquet (66) -(47) Scan parquet +(53) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(48) NoopFilter +(54) NoopFilter Input [2]: [c_phone#X, c_acctbal#X] Arguments: [c_phone#X, c_acctbal#X] -(49) ProjectExecTransformer +(55) ProjectExecTransformer Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(50) FlushableHashAggregateExecTransformer +(56) FlushableHashAggregateExecTransformer Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(51) WholeStageCodegenTransformer (X) +(57) WholeStageCodegenTransformer (X) Input [2]: [sum#X, count#X] Arguments: false -(52) ColumnarExchange +(58) VeloxAppendBatches +Input [2]: [sum#X, count#X] +Arguments: X + +(59) ColumnarExchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(60) ShuffleQueryStage Output [2]: [sum#X, count#X] Arguments: X -(54) InputAdapter +(61) InputAdapter Input [2]: [sum#X, count#X] -(55) InputIteratorTransformer +(62) InputIteratorTransformer Input [2]: [sum#X, count#X] -(56) RegularHashAggregateExecTransformer +(63) RegularHashAggregateExecTransformer Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [1]: [avg(c_acctbal)#X] Arguments: false -(58) VeloxColumnarToRowExec +(65) VeloxColumnarToRowExec Input [1]: [avg(c_acctbal)#X] -(59) Scan parquet +(66) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(60) Filter +(67) Filter Input [2]: [c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND (c_acctbal#X > 0.00)) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) -(61) Project +(68) Project Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(62) HashAggregate +(69) HashAggregate Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(63) Exchange +(70) Exchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(64) HashAggregate +(71) HashAggregate Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(65) AdaptiveSparkPlan +(72) AdaptiveSparkPlan Output [1]: [avg(c_acctbal)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/3.txt index 16615ac0598fd..aa679861da7c3 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/3.txt @@ -1,60 +1,64 @@ == Physical Plan == -AdaptiveSparkPlan (59) +AdaptiveSparkPlan (67) +- == Final Plan == - VeloxColumnarToRowExec (39) - +- 
TakeOrderedAndProjectExecTransformer (38) - +- ^ ProjectExecTransformer (36) - +- ^ RegularHashAggregateExecTransformer (35) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ ProjectExecTransformer (33) - +- ^ ShuffledHashJoinExecTransformer Inner (32) - :- ^ InputIteratorTransformer (23) - : +- ^ InputAdapter (22) - : +- ^ ShuffleQueryStage (21), Statistics(X) - : +- ColumnarExchange (20) - : +- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (43) + +- TakeOrderedAndProjectExecTransformer (42) + +- ^ ProjectExecTransformer (40) + +- ^ RegularHashAggregateExecTransformer (39) + +- ^ RegularHashAggregateExecTransformer (38) + +- ^ ProjectExecTransformer (37) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + :- ^ InputIteratorTransformer (26) + : +- ShuffleQueryStage (24), Statistics(X) + : +- ColumnarExchange (23) + : +- VeloxAppendBatches (22) + : +- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ NoopFilter (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14), Statistics(X) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ NoopFilter (10) - : +- ^ Scan parquet (9) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ ProjectExecTransformer (26) - +- ^ NoopFilter (25) - +- ^ Scan parquet (24) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ NoopFilter (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ ProjectExecTransformer (29) + +- ^ NoopFilter (28) + +- ^ Scan parquet (27) +- == Initial Plan == - TakeOrderedAndProject (58) - +- HashAggregate (57) - +- HashAggregate (56) - +- Project (55) - +- ShuffledHashJoin Inner BuildRight (54) - :- Exchange (49) - : +- Project (48) - : +- ShuffledHashJoin Inner BuildLeft (47) - : :- Exchange (43) - : : +- Project (42) - : : +- Filter (41) - : : +- Scan parquet (40) - : +- Exchange (46) - : +- Filter (45) - : +- Scan parquet (44) - +- Exchange (53) - +- Project (52) - +- Filter (51) - +- Scan parquet (50) + TakeOrderedAndProject (66) + +- HashAggregate (65) + +- HashAggregate (64) + +- Project (63) + +- SortMergeJoin Inner (62) + :- Sort (56) + : +- Exchange (55) + : +- Project (54) + : +- SortMergeJoin Inner (53) + : :- Sort (48) + : : +- Exchange (47) + : : +- Project (46) + : : +- Filter (45) + : : +- Scan parquet (44) + : +- Sort (52) + : +- Exchange (51) + : +- Filter (50) + : +- Scan parquet (49) + +- Sort (61) + +- Exchange (60) + +- Project (59) + +- Filter (58) + +- Scan parquet (57) (1) Scan parquet @@ -76,248 +80,280 @@ Input [2]: [c_custkey#X, c_mktsegment#X] Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, c_custkey#X] +Arguments: X + +(6) 
ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [c_custkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [c_custkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [c_custkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] +Arguments: X + +(23) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] 
Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [6]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(34) RegularHashAggregateExecTransformer +(38) RegularHashAggregateExecTransformer Input [6]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(35) RegularHashAggregateExecTransformer +(39) RegularHashAggregateExecTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(36) ProjectExecTransformer +(40) ProjectExecTransformer Output [4]: [l_orderkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] Input [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(37) WholeStageCodegenTransformer (X) +(41) WholeStageCodegenTransformer (X) Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: false -(38) TakeOrderedAndProjectExecTransformer +(42) TakeOrderedAndProjectExecTransformer Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, 
o_orderdate#X, o_shippriority#X], 0 -(39) VeloxColumnarToRowExec +(43) VeloxColumnarToRowExec Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(40) Scan parquet +(44) Scan parquet Output [2]: [c_custkey#X, c_mktsegment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_mktsegment), EqualTo(c_mktsegment,BUILDING), IsNotNull(c_custkey)] ReadSchema: struct -(41) Filter +(45) Filter Input [2]: [c_custkey#X, c_mktsegment#X] Condition : ((isnotnull(c_mktsegment#X) AND (c_mktsegment#X = BUILDING)) AND isnotnull(c_custkey#X)) -(42) Project +(46) Project Output [1]: [c_custkey#X] Input [2]: [c_custkey#X, c_mktsegment#X] -(43) Exchange +(47) Exchange Input [1]: [c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(44) Scan parquet +(48) Sort +Input [1]: [c_custkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(49) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(45) Filter +(50) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Condition : (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(46) Exchange +(51) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(47) ShuffledHashJoin +(52) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(53) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(48) Project +(54) Project Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(49) Exchange +(55) Exchange Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(50) Scan parquet +(56) Sort +Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(57) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(51) Filter +(58) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(52) Project +(59) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(53) Exchange +(60) Exchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) ShuffledHashJoin +(61) Sort +Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(62) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(55) Project +(63) Project Output [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [6]: 
[o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(56) HashAggregate +(64) HashAggregate Input [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(57) HashAggregate +(65) HashAggregate Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [4]: [l_orderkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] -(58) TakeOrderedAndProject +(66) TakeOrderedAndProject Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(59) AdaptiveSparkPlan +(67) AdaptiveSparkPlan Output [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/4.txt index 1d3f8903f89fe..02c494288f95c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/4.txt @@ -1,51 +1,53 @@ == Physical Plan == -AdaptiveSparkPlan (50) +AdaptiveSparkPlan (56) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- ^ SortExecTransformer (32) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer LeftSemi (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ NoopFilter (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ NoopFilter (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- 
ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ NoopFilter (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (49) - +- Exchange (48) - +- HashAggregate (47) - +- Exchange (46) - +- HashAggregate (45) - +- Project (44) - +- ShuffledHashJoin LeftSemi BuildRight (43) - :- Exchange (38) - : +- Project (37) - : +- Filter (36) - : +- Scan parquet (35) - +- Exchange (42) - +- Project (41) - +- Filter (40) - +- Scan parquet (39) + Sort (55) + +- Exchange (54) + +- HashAggregate (53) + +- Exchange (52) + +- HashAggregate (51) + +- Project (50) + +- SortMergeJoin LeftSemi (49) + :- Sort (43) + : +- Exchange (42) + : +- Project (41) + : +- Filter (40) + : +- Scan parquet (39) + +- Sort (48) + +- Exchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -67,202 +69,226 @@ Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [o_orderkey#X, o_orderpriority#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderpriority#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Arguments: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [2]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, l_orderkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [2]: [hash_partition_key#X, l_orderkey#X] +Arguments: X + +(15) ColumnarExchange Input [2]: [hash_partition_key#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [1]: [l_orderkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [1]: [l_orderkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [1]: [l_orderkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer 
Output [3]: [hash(o_orderpriority#X, 42) AS hash_partition_key#X, o_orderpriority#X, count#X] Input [2]: [o_orderpriority#X, count#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] +Arguments: X + +(25) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [2]: [o_orderpriority#X, count#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [2]: [o_orderpriority#X, count#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [2]: [o_orderpriority#X, order_count#X] +Arguments: X + +(32) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [o_orderpriority#X, order_count#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [o_orderpriority#X, order_count#X] -(32) SortExecTransformer +(36) SortExecTransformer Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(33) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(34) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [2]: [o_orderpriority#X, order_count#X] -(35) Scan parquet +(39) Scan parquet Output [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-07-01), LessThan(o_orderdate,1993-10-01)] ReadSchema: struct -(36) Filter +(40) Filter Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Condition : ((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-07-01)) AND (o_orderdate#X < 1993-10-01)) -(37) Project +(41) Project Output [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] -(38) Exchange +(42) Exchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) Scan parquet +(43) Sort +Input [2]: [o_orderkey#X, o_orderpriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(44) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(40) Filter +(45) Filter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_commitdate#X) AND 
isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(41) Project +(46) Project Output [1]: [l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(42) Exchange +(47) Exchange Input [1]: [l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) ShuffledHashJoin +(48) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(49) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(44) Project +(50) Project Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(45) HashAggregate +(51) HashAggregate Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(46) Exchange +(52) Exchange Input [2]: [o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(47) HashAggregate +(53) HashAggregate Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(48) Exchange +(54) Exchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) Sort +(55) Sort Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(50) AdaptiveSparkPlan +(56) AdaptiveSparkPlan Output [2]: [o_orderpriority#X, order_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/5.txt index 12c4c6add2404..67150984ab61e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/5.txt @@ -1,127 +1,137 @@ == Physical Plan == -AdaptiveSparkPlan (134) +AdaptiveSparkPlan (156) +- == Final Plan == - VeloxColumnarToRowExec (94) - +- ^ SortExecTransformer (92) - +- ^ InputIteratorTransformer (91) - +- ^ InputAdapter (90) - +- ^ ShuffleQueryStage (89), Statistics(X) - +- ColumnarExchange (88) - +- ^ RegularHashAggregateExecTransformer (86) - +- ^ InputIteratorTransformer (85) - +- ^ InputAdapter (84) - +- ^ ShuffleQueryStage (83), Statistics(X) - +- ColumnarExchange (82) - +- ^ ProjectExecTransformer (80) - +- ^ FlushableHashAggregateExecTransformer (79) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ 
ShuffleQueryStage (21), Statistics(X) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (106) + +- ^ SortExecTransformer (104) + +- ^ InputIteratorTransformer (103) + +- ShuffleQueryStage (101), Statistics(X) + +- ColumnarExchange (100) + +- VeloxAppendBatches (99) + +- ^ RegularHashAggregateExecTransformer (97) + +- ^ InputIteratorTransformer (96) + +- ShuffleQueryStage (94), Statistics(X) + +- ColumnarExchange (93) + +- VeloxAppendBatches (92) + +- ^ ProjectExecTransformer (90) + +- ^ FlushableHashAggregateExecTransformer (89) + +- ^ ProjectExecTransformer (88) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75), Statistics(X) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58), Statistics(X) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41), Statistics(X) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ NoopFilter (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ NoopFilter (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ NoopFilter (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ NoopFilter (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ NoopFilter (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74), Statistics(X) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ NoopFilter (70) - +- ^ Scan parquet (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- 
ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ NoopFilter (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33), Statistics(X) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ NoopFilter (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ NoopFilter (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ NoopFilter (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (86) + +- ShuffleQueryStage (84), Statistics(X) + +- ColumnarExchange (83) + +- VeloxAppendBatches (82) + +- ^ ProjectExecTransformer (80) + +- ^ NoopFilter (79) + +- ^ Scan parquet (78) +- == Initial Plan == - Sort (133) - +- Exchange (132) - +- HashAggregate (131) - +- Exchange (130) - +- HashAggregate (129) - +- Project (128) - +- ShuffledHashJoin Inner BuildRight (127) - :- Exchange (122) - : +- Project (121) - : +- ShuffledHashJoin Inner BuildRight (120) - : :- Exchange (116) - : : +- Project (115) - : : +- ShuffledHashJoin Inner BuildRight (114) - : : :- Exchange (110) - : : : +- Project (109) - : : : +- ShuffledHashJoin Inner BuildRight (108) - : : : :- Exchange (104) - : : : : +- Project (103) - : : : : +- ShuffledHashJoin Inner BuildLeft (102) - : : : : :- Exchange (97) - : : : : : +- Filter (96) - : : : : : +- Scan parquet (95) - : : : : +- Exchange (101) - : : : : +- Project (100) - : : : : +- Filter (99) - : : : : +- Scan parquet (98) - : : : +- Exchange (107) - : : : +- Filter (106) - : : : +- Scan parquet (105) - : : +- Exchange (113) - : : +- Filter (112) - : : +- Scan parquet (111) - : +- Exchange (119) - : +- Filter (118) - : +- Scan parquet (117) - +- Exchange (126) - +- Project (125) - +- Filter (124) - +- Scan parquet (123) + Sort (155) + +- Exchange (154) + +- HashAggregate (153) + +- Exchange (152) + +- HashAggregate (151) + +- Project (150) + +- SortMergeJoin Inner (149) + :- Sort (143) + : +- Exchange (142) + : +- Project (141) + : +- SortMergeJoin Inner (140) + : :- Sort (135) + : : +- Exchange (134) + : : +- Project (133) + : : +- SortMergeJoin Inner (132) + : : :- Sort (127) + : : : +- Exchange (126) + : : : +- Project (125) + : : : +- SortMergeJoin Inner (124) + : : : :- Sort (119) + : : : : +- Exchange (118) + : : : : +- Project (117) + : : : : +- SortMergeJoin Inner (116) + : : : : :- Sort (110) + : : : : : +- Exchange (109) + : : : : : +- Filter (108) + : : : : : +- Scan parquet (107) + : : : : +- Sort (115) + : : : : +- Exchange (114) + : : : : +- Project (113) + : : : : +- Filter (112) + : : : : +- Scan parquet (111) + : : : +- Sort (123) + : : : +- Exchange (122) + : : : +- Filter (121) + : : : +- Scan parquet (120) + : : +- Sort (131) + : : +- Exchange (130) + : : +- Filter (129) + : : +- Scan parquet (128) + : +- Sort (139) + : +- Exchange (138) + : +- Filter (137) + : +- Scan parquet (136) + +- Sort (148) + +- Exchange (147) + +- Project (146) + +- Filter (145) + +- Scan parquet (144) (1) Scan parquet @@ -143,562 +153,650 @@ Input [2]: [c_custkey#X, c_nationkey#X] Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] 
Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] +Arguments: X + +(23) ColumnarExchange Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [2]: [c_nationkey#X, o_orderkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [2]: [c_nationkey#X, o_orderkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [2]: [c_nationkey#X, o_orderkey#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] 
Arguments: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [5]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [5]: [hash(l_suppkey#X, c_nationkey#X, 42) AS hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(40) ColumnarExchange Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: [s_suppkey#X, s_nationkey#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, s_nationkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: 
[hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join type: Inner Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(57) ColumnarExchange Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] -(54) Scan parquet +(61) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(55) NoopFilter +(62) NoopFilter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: [n_nationkey#X, n_name#X, n_regionkey#X] -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [4]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] +Arguments: X + +(66) ColumnarExchange Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(63) 
ProjectExecTransformer +(71) ProjectExecTransformer Output [5]: [hash(n_regionkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] +Arguments: X + +(74) ColumnarExchange Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(70) NoopFilter +(79) NoopFilter Input [2]: [r_regionkey#X, r_name#X] Arguments: [r_regionkey#X, r_name#X] -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [2]: [hash(r_regionkey#X, 42) AS hash_partition_key#X, r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [2]: [hash_partition_key#X, r_regionkey#X] +Arguments: X + +(83) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [1]: [r_regionkey#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [1]: [r_regionkey#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [1]: [r_regionkey#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join type: Inner Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(79) FlushableHashAggregateExecTransformer +(89) FlushableHashAggregateExecTransformer Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, _pre_X#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(80) ProjectExecTransformer +(90) ProjectExecTransformer Output [4]: [hash(n_name#X, 42) AS hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Input [3]: [n_name#X, sum#X, isEmpty#X] -(81) WholeStageCodegenTransformer (X) +(91) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: false -(82) ColumnarExchange +(92) VeloxAppendBatches 
+Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] +Arguments: X + +(93) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(83) ShuffleQueryStage +(94) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] Arguments: X -(84) InputAdapter +(95) InputAdapter Input [3]: [n_name#X, sum#X, isEmpty#X] -(85) InputIteratorTransformer +(96) InputIteratorTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] -(86) RegularHashAggregateExecTransformer +(97) RegularHashAggregateExecTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [n_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [2]: [n_name#X, revenue#X] +Arguments: X + +(100) ColumnarExchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [2]: [n_name#X, revenue#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [2]: [n_name#X, revenue#X] -(92) SortExecTransformer +(104) SortExecTransformer Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(93) WholeStageCodegenTransformer (X) +(105) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(94) VeloxColumnarToRowExec +(106) VeloxColumnarToRowExec Input [2]: [n_name#X, revenue#X] -(95) Scan parquet +(107) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(96) Filter +(108) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(97) Exchange +(109) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(98) Scan parquet +(110) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(111) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(99) Filter +(112) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(100) Project +(113) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(101) Exchange +(114) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) ShuffledHashJoin +(115) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(116) SortMergeJoin Left keys [1]: [c_custkey#X] 
Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(103) Project +(117) Project Output [2]: [c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(104) Exchange +(118) Exchange Input [2]: [c_nationkey#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) Scan parquet +(119) Sort +Input [2]: [c_nationkey#X, o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(120) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(106) Filter +(121) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(107) Exchange +(122) Exchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) ShuffledHashJoin +(123) Sort +Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(124) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(109) Project +(125) Project Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(110) Exchange +(126) Exchange Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(111) Scan parquet +(127) Sort +Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST, c_nationkey#X ASC NULLS FIRST], false, 0 + +(128) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(112) Filter +(129) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(113) Exchange +(130) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(114) ShuffledHashJoin +(131) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST, s_nationkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join type: Inner Join condition: None -(115) Project +(133) Project Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(116) Exchange +(134) Exchange Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) Scan parquet +(135) Sort +Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(136) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(118) Filter +(137) Filter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Condition : 
(isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(119) Exchange +(138) Exchange Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(120) ShuffledHashJoin +(139) Sort +Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(140) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(121) Project +(141) Project Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(122) Exchange +(142) Exchange Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(123) Scan parquet +(143) Sort +Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] +Arguments: [n_regionkey#X ASC NULLS FIRST], false, 0 + +(144) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(124) Filter +(145) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(125) Project +(146) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(126) Exchange +(147) Exchange Input [1]: [r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(127) ShuffledHashJoin +(148) Sort +Input [1]: [r_regionkey#X] +Arguments: [r_regionkey#X ASC NULLS FIRST], false, 0 + +(149) SortMergeJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join type: Inner Join condition: None -(128) Project +(150) Project Output [3]: [l_extendedprice#X, l_discount#X, n_name#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(129) HashAggregate +(151) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, n_name#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(130) Exchange +(152) Exchange Input [3]: [n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(131) HashAggregate +(153) HashAggregate Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [n_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(132) Exchange +(154) Exchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(133) Sort +(155) Sort Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(134) AdaptiveSparkPlan +(156) AdaptiveSparkPlan Output [2]: [n_name#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/6.txt index 9333e2a8ad599..64624c791f726 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/6.txt @@ 
-1,23 +1,23 @@ == Physical Plan == -AdaptiveSparkPlan (19) +AdaptiveSparkPlan (20) +- == Final Plan == - VeloxColumnarToRowExec (12) - +- ^ RegularHashAggregateExecTransformer (10) - +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ ShuffleQueryStage (7), Statistics(X) - +- ColumnarExchange (6) + VeloxColumnarToRowExec (13) + +- ^ RegularHashAggregateExecTransformer (11) + +- ^ InputIteratorTransformer (10) + +- ShuffleQueryStage (8), Statistics(X) + +- ColumnarExchange (7) + +- VeloxAppendBatches (6) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ NoopFilter (2) +- ^ Scan parquet (1) +- == Initial Plan == - HashAggregate (18) - +- Exchange (17) - +- HashAggregate (16) - +- Project (15) - +- Filter (14) - +- Scan parquet (13) + HashAggregate (19) + +- Exchange (18) + +- HashAggregate (17) + +- Project (16) + +- Filter (15) + +- Scan parquet (14) (1) Scan parquet @@ -46,67 +46,71 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(6) ColumnarExchange +(6) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(7) ShuffleQueryStage +(8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(8) InputAdapter +(9) InputAdapter Input [2]: [sum#X, isEmpty#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(10) RegularHashAggregateExecTransformer +(11) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * l_discount#X))] Aggregate Attributes [1]: [sum((l_extendedprice#X * l_discount#X))#X] Results [1]: [sum((l_extendedprice#X * l_discount#X))#X AS revenue#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(12) VeloxColumnarToRowExec +(13) VeloxColumnarToRowExec Input [1]: [revenue#X] -(13) Scan parquet +(14) Scan parquet Output [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), IsNotNull(l_discount), IsNotNull(l_quantity), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), GreaterThanOrEqual(l_discount,0.05), LessThanOrEqual(l_discount,0.07), LessThan(l_quantity,24.00)] ReadSchema: struct -(14) Filter +(15) Filter Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((((((isnotnull(l_shipdate#X) AND isnotnull(l_discount#X)) AND isnotnull(l_quantity#X)) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND (l_discount#X >= 0.05)) AND (l_discount#X <= 0.07)) AND (l_quantity#X < 24.00)) -(15) Project +(16) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) HashAggregate +(17) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum((l_extendedprice#X * l_discount#X))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(17) Exchange +(18) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(18) HashAggregate +(19) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * l_discount#X))] Aggregate Attributes [1]: [sum((l_extendedprice#X * l_discount#X))#X] Results [1]: [sum((l_extendedprice#X * 
l_discount#X))#X AS revenue#X] -(19) AdaptiveSparkPlan +(20) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/7.txt index 03305d572de7a..65dfab993c3c3 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/7.txt @@ -1,122 +1,131 @@ == Physical Plan == -AdaptiveSparkPlan (128) +AdaptiveSparkPlan (149) +- == Final Plan == - VeloxColumnarToRowExec (90) - +- ^ SortExecTransformer (88) - +- ^ InputIteratorTransformer (87) - +- ^ InputAdapter (86) - +- ^ ShuffleQueryStage (85), Statistics(X) - +- ColumnarExchange (84) - +- ^ RegularHashAggregateExecTransformer (82) - +- ^ InputIteratorTransformer (81) - +- ^ InputAdapter (80) - +- ^ ShuffleQueryStage (79), Statistics(X) - +- ColumnarExchange (78) - +- ^ ProjectExecTransformer (76) - +- ^ FlushableHashAggregateExecTransformer (75) - +- ^ ProjectExecTransformer (74) - +- ^ ShuffledHashJoinExecTransformer Inner (73) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (101) + +- ^ SortExecTransformer (99) + +- ^ InputIteratorTransformer (98) + +- ShuffleQueryStage (96), Statistics(X) + +- ColumnarExchange (95) + +- VeloxAppendBatches (94) + +- ^ RegularHashAggregateExecTransformer (92) + +- ^ InputIteratorTransformer (91) + +- ShuffleQueryStage (89), Statistics(X) + +- ColumnarExchange (88) + +- VeloxAppendBatches (87) + +- ^ ProjectExecTransformer (85) + +- ^ FlushableHashAggregateExecTransformer (84) + +- ^ ProjectExecTransformer (83) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (82) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75), Statistics(X) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58), Statistics(X) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41), Statistics(X) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches 
(39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ NoopFilter (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ NoopFilter (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ NoopFilter (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ NoopFilter (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ NoopFilter (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (72) - +- ^ InputAdapter (71) - +- ^ ShuffleQueryStage (70), Statistics(X) - +- ReusedExchange (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ NoopFilter (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33), Statistics(X) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ NoopFilter (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ NoopFilter (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ NoopFilter (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (81) + +- ShuffleQueryStage (79), Statistics(X) + +- ReusedExchange (78) +- == Initial Plan == - Sort (127) - +- Exchange (126) - +- HashAggregate (125) - +- Exchange (124) - +- HashAggregate (123) - +- Project (122) - +- ShuffledHashJoin Inner BuildRight (121) - :- Exchange (117) - : +- Project (116) - : +- ShuffledHashJoin Inner BuildRight (115) - : :- Exchange (111) - : : +- Project (110) - : : +- ShuffledHashJoin Inner BuildRight (109) - : : :- Exchange (105) - : : : +- Project (104) - : : : +- ShuffledHashJoin Inner BuildRight (103) - : : : :- Exchange (99) - : : : : +- Project (98) - : : : : +- ShuffledHashJoin Inner BuildLeft (97) - : : : : :- Exchange (93) - : : : : : +- Filter (92) - : 
: : : : +- Scan parquet (91) - : : : : +- Exchange (96) - : : : : +- Filter (95) - : : : : +- Scan parquet (94) - : : : +- Exchange (102) - : : : +- Filter (101) - : : : +- Scan parquet (100) - : : +- Exchange (108) - : : +- Filter (107) - : : +- Scan parquet (106) - : +- Exchange (114) - : +- Filter (113) - : +- Scan parquet (112) - +- Exchange (120) - +- Filter (119) - +- Scan parquet (118) + Sort (148) + +- Exchange (147) + +- HashAggregate (146) + +- Exchange (145) + +- HashAggregate (144) + +- Project (143) + +- SortMergeJoin Inner (142) + :- Sort (137) + : +- Exchange (136) + : +- Project (135) + : +- SortMergeJoin Inner (134) + : :- Sort (129) + : : +- Exchange (128) + : : +- Project (127) + : : +- SortMergeJoin Inner (126) + : : :- Sort (121) + : : : +- Exchange (120) + : : : +- Project (119) + : : : +- SortMergeJoin Inner (118) + : : : :- Sort (113) + : : : : +- Exchange (112) + : : : : +- Project (111) + : : : : +- SortMergeJoin Inner (110) + : : : : :- Sort (105) + : : : : : +- Exchange (104) + : : : : : +- Filter (103) + : : : : : +- Scan parquet (102) + : : : : +- Sort (109) + : : : : +- Exchange (108) + : : : : +- Filter (107) + : : : : +- Scan parquet (106) + : : : +- Sort (117) + : : : +- Exchange (116) + : : : +- Filter (115) + : : : +- Scan parquet (114) + : : +- Sort (125) + : : +- Exchange (124) + : : +- Filter (123) + : : +- Scan parquet (122) + : +- Sort (133) + : +- Exchange (132) + : +- Filter (131) + : +- Scan parquet (130) + +- Sort (141) + +- Exchange (140) + +- Filter (139) + +- Scan parquet (138) (1) Scan parquet @@ -138,534 +147,618 @@ Input [2]: [s_suppkey#X, s_nationkey#X] Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [6]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: X + +(15) ColumnarExchange 
Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [6]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: X + +(23) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [2]: [o_orderkey#X, o_custkey#X] Arguments: [o_orderkey#X, o_custkey#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [2]: [o_orderkey#X, o_custkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(32) 
ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [6]: [hash(o_custkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] +Arguments: X + +(40) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [2]: [c_custkey#X, c_nationkey#X] Arguments: [c_custkey#X, c_nationkey#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(c_custkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_nationkey#X] Input [2]: [c_custkey#X, c_nationkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join type: Inner Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [6]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: false -(50) 
ColumnarExchange +(56) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] +Arguments: X + +(57) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(55) NoopFilter +(62) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [6]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] +Arguments: X + +(74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, 
l_shipdate#X, c_nationkey#X, n_name#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -(69) ReusedExchange [Reuses operator id: 58] +(78) ReusedExchange [Reuses operator id: 66] Output [2]: [n_nationkey#X, n_name#X] -(70) ShuffleQueryStage +(79) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(71) InputAdapter +(80) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(72) InputIteratorTransformer +(81) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(73) ShuffledHashJoinExecTransformer +(82) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(74) ProjectExecTransformer +(83) ProjectExecTransformer Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, (l_extendedprice#X * (1 - l_discount#X)) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(75) FlushableHashAggregateExecTransformer +(84) FlushableHashAggregateExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(76) ProjectExecTransformer +(85) ProjectExecTransformer Output [6]: [hash(supp_nation#X, cust_nation#X, l_year#X, 42) AS hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(77) WholeStageCodegenTransformer (X) +(86) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: false -(78) ColumnarExchange +(87) VeloxAppendBatches +Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] +Arguments: X + +(88) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(79) ShuffleQueryStage +(89) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: X -(80) InputAdapter +(90) InputAdapter Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(81) InputIteratorTransformer +(91) InputIteratorTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(82) RegularHashAggregateExecTransformer +(92) RegularHashAggregateExecTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(83) WholeStageCodegenTransformer (X) +(93) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(84) ColumnarExchange +(94) VeloxAppendBatches +Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] +Arguments: X + +(95) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] 
Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(85) ShuffleQueryStage +(96) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: X -(86) InputAdapter +(97) InputAdapter Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(87) InputIteratorTransformer +(98) InputIteratorTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(88) SortExecTransformer +(99) SortExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(89) WholeStageCodegenTransformer (X) +(100) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(90) VeloxColumnarToRowExec +(101) VeloxColumnarToRowExec Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(91) Scan parquet +(102) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(92) Filter +(103) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(93) Exchange +(104) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(94) Scan parquet +(105) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(106) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(95) Filter +(107) Filter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(96) Exchange +(108) Exchange Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) ShuffledHashJoin +(109) Sort +Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(110) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join type: Inner Join condition: None -(98) Project +(111) Project Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(99) Exchange +(112) Exchange Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(100) Scan parquet +(113) Sort +Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(114) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(101) Filter +(115) 
Filter Input [2]: [o_orderkey#X, o_custkey#X] Condition : (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(102) Exchange +(116) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(103) ShuffledHashJoin +(117) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(118) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(104) Project +(119) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(105) Exchange +(120) Exchange Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) Scan parquet +(121) Sort +Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(122) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(107) Filter +(123) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(108) Exchange +(124) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(125) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(126) SortMergeJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join type: Inner Join condition: None -(110) Project +(127) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(111) Exchange +(128) Exchange Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) Scan parquet +(129) Sort +Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(130) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(113) Filter +(131) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(114) Exchange +(132) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(115) ShuffledHashJoin +(133) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(134) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(116) Project +(135) Project Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(117) Exchange +(136) Exchange Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, 
c_nationkey#X, n_name#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(118) Scan parquet +(137) Sort +Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(138) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,GERMANY),EqualTo(n_name,FRANCE))] ReadSchema: struct -(119) Filter +(139) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = GERMANY) OR (n_name#X = FRANCE))) -(120) Exchange +(140) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(121) ShuffledHashJoin +(141) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(142) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(122) Project +(143) Project Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, (l_extendedprice#X * (1 - l_discount#X)) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(123) HashAggregate +(144) HashAggregate Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(124) Exchange +(145) Exchange Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(125) HashAggregate +(146) HashAggregate Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(126) Exchange +(147) Exchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(127) Sort +(148) Sort Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(128) AdaptiveSparkPlan +(149) AdaptiveSparkPlan Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/8.txt index f0176bc5e011c..61f6287c2429d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/8.txt @@ -1,166 +1,180 @@ == Physical Plan == -AdaptiveSparkPlan (177) +AdaptiveSparkPlan (207) +- == Final Plan == - VeloxColumnarToRowExec (125) - +- ^ SortExecTransformer (123) - +- ^ InputIteratorTransformer (122) - +- ^ InputAdapter (121) - +- ^ ShuffleQueryStage (120), Statistics(X) - +- 
ColumnarExchange (119) - +- ^ ProjectExecTransformer (117) - +- ^ RegularHashAggregateExecTransformer (116) - +- ^ InputIteratorTransformer (115) - +- ^ InputAdapter (114) - +- ^ ShuffleQueryStage (113), Statistics(X) - +- ColumnarExchange (112) - +- ^ ProjectExecTransformer (110) - +- ^ FlushableHashAggregateExecTransformer (109) - +- ^ ProjectExecTransformer (108) - +- ^ ShuffledHashJoinExecTransformer Inner (107) - :- ^ InputIteratorTransformer (98) - : +- ^ InputAdapter (97) - : +- ^ ShuffleQueryStage (96), Statistics(X) - : +- ColumnarExchange (95) - : +- ^ ProjectExecTransformer (93) - : +- ^ ShuffledHashJoinExecTransformer Inner (92) - : :- ^ InputIteratorTransformer (83) - : : +- ^ InputAdapter (82) - : : +- ^ ShuffleQueryStage (81), Statistics(X) - : : +- ColumnarExchange (80) - : : +- ^ ProjectExecTransformer (78) - : : +- ^ ShuffledHashJoinExecTransformer Inner (77) - : : :- ^ InputIteratorTransformer (68) - : : : +- ^ InputAdapter (67) - : : : +- ^ ShuffleQueryStage (66), Statistics(X) - : : : +- ColumnarExchange (65) - : : : +- ^ ProjectExecTransformer (63) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : : : :- ^ InputIteratorTransformer (53) - : : : : +- ^ InputAdapter (52) - : : : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : : : +- ColumnarExchange (50) - : : : : +- ^ ProjectExecTransformer (48) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : : : :- ^ InputIteratorTransformer (38) - : : : : : +- ^ InputAdapter (37) - : : : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : : : +- ColumnarExchange (35) - : : : : : +- ^ ProjectExecTransformer (33) - : : : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : : : :- ^ InputIteratorTransformer (23) - : : : : : : +- ^ InputAdapter (22) - : : : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : : : +- ColumnarExchange (20) - : : : : : : +- ^ ProjectExecTransformer (18) - : : : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : : : :- ^ InputIteratorTransformer (8) - : : : : : : : +- ^ InputAdapter (7) - : : : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (141) + +- ^ SortExecTransformer (139) + +- ^ InputIteratorTransformer (138) + +- ShuffleQueryStage (136), Statistics(X) + +- ColumnarExchange (135) + +- VeloxAppendBatches (134) + +- ^ ProjectExecTransformer (132) + +- ^ RegularHashAggregateExecTransformer (131) + +- ^ InputIteratorTransformer (130) + +- ShuffleQueryStage (128), Statistics(X) + +- ColumnarExchange (127) + +- VeloxAppendBatches (126) + +- ^ ProjectExecTransformer (124) + +- ^ FlushableHashAggregateExecTransformer (123) + +- ^ ProjectExecTransformer (122) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (121) + :- ^ InputIteratorTransformer (111) + : +- ShuffleQueryStage (109), Statistics(X) + : +- ColumnarExchange (108) + : +- VeloxAppendBatches (107) + : +- ^ ProjectExecTransformer (105) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (104) + : :- ^ InputIteratorTransformer (94) + : : +- ShuffleQueryStage (92), Statistics(X) + : : +- ColumnarExchange (91) + : : +- VeloxAppendBatches (90) + : : +- ^ ProjectExecTransformer (88) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + : : :- ^ InputIteratorTransformer (77) + : : : +- ShuffleQueryStage (75), Statistics(X) + : : : +- ColumnarExchange (74) + : : : +- VeloxAppendBatches (73) + : : : +- ^ ProjectExecTransformer (71) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (70) + : : : :- ^ 
InputIteratorTransformer (60) + : : : : +- ShuffleQueryStage (58), Statistics(X) + : : : : +- ColumnarExchange (57) + : : : : +- VeloxAppendBatches (56) + : : : : +- ^ ProjectExecTransformer (54) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (53) + : : : : :- ^ InputIteratorTransformer (43) + : : : : : +- ShuffleQueryStage (41), Statistics(X) + : : : : : +- ColumnarExchange (40) + : : : : : +- VeloxAppendBatches (39) + : : : : : +- ^ ProjectExecTransformer (37) + : : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : : : :- ^ InputIteratorTransformer (26) + : : : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : : : +- ColumnarExchange (23) + : : : : : : +- VeloxAppendBatches (22) + : : : : : : +- ^ ProjectExecTransformer (20) + : : : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : : : :- ^ InputIteratorTransformer (9) + : : : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : : : +- ColumnarExchange (6) + : : : : : : : +- VeloxAppendBatches (5) : : : : : : : +- ^ ProjectExecTransformer (3) : : : : : : : +- ^ NoopFilter (2) : : : : : : : +- ^ Scan parquet (1) - : : : : : : +- ^ InputIteratorTransformer (16) - : : : : : : +- ^ InputAdapter (15) - : : : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : : : +- ColumnarExchange (13) - : : : : : : +- ^ ProjectExecTransformer (11) - : : : : : : +- ^ NoopFilter (10) - : : : : : : +- ^ Scan parquet (9) - : : : : : +- ^ InputIteratorTransformer (31) - : : : : : +- ^ InputAdapter (30) - : : : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : : : +- ColumnarExchange (28) - : : : : : +- ^ ProjectExecTransformer (26) - : : : : : +- ^ NoopFilter (25) - : : : : : +- ^ Scan parquet (24) - : : : : +- ^ InputIteratorTransformer (46) - : : : : +- ^ InputAdapter (45) - : : : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : : : +- ColumnarExchange (43) - : : : : +- ^ ProjectExecTransformer (41) - : : : : +- ^ NoopFilter (40) - : : : : +- ^ Scan parquet (39) - : : : +- ^ InputIteratorTransformer (61) - : : : +- ^ InputAdapter (60) - : : : +- ^ ShuffleQueryStage (59), Statistics(X) - : : : +- ColumnarExchange (58) - : : : +- ^ ProjectExecTransformer (56) - : : : +- ^ NoopFilter (55) - : : : +- ^ Scan parquet (54) - : : +- ^ InputIteratorTransformer (76) - : : +- ^ InputAdapter (75) - : : +- ^ ShuffleQueryStage (74), Statistics(X) - : : +- ColumnarExchange (73) - : : +- ^ ProjectExecTransformer (71) - : : +- ^ NoopFilter (70) - : : +- ^ Scan parquet (69) - : +- ^ InputIteratorTransformer (91) - : +- ^ InputAdapter (90) - : +- ^ ShuffleQueryStage (89), Statistics(X) - : +- ColumnarExchange (88) - : +- ^ ProjectExecTransformer (86) - : +- ^ NoopFilter (85) - : +- ^ Scan parquet (84) - +- ^ InputIteratorTransformer (106) - +- ^ InputAdapter (105) - +- ^ ShuffleQueryStage (104), Statistics(X) - +- ColumnarExchange (103) - +- ^ ProjectExecTransformer (101) - +- ^ NoopFilter (100) - +- ^ Scan parquet (99) + : : : : : : +- ^ InputIteratorTransformer (18) + : : : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : : : +- ColumnarExchange (15) + : : : : : : +- VeloxAppendBatches (14) + : : : : : : +- ^ ProjectExecTransformer (12) + : : : : : : +- ^ NoopFilter (11) + : : : : : : +- ^ Scan parquet (10) + : : : : : +- ^ InputIteratorTransformer (35) + : : : : : +- ShuffleQueryStage (33), Statistics(X) + : : : : : +- ColumnarExchange (32) + : : : : : +- VeloxAppendBatches (31) + : : : : : +- ^ ProjectExecTransformer (29) + : : : : : +- ^ NoopFilter (28) + : : : : : +- ^ Scan 
parquet (27) + : : : : +- ^ InputIteratorTransformer (52) + : : : : +- ShuffleQueryStage (50), Statistics(X) + : : : : +- ColumnarExchange (49) + : : : : +- VeloxAppendBatches (48) + : : : : +- ^ ProjectExecTransformer (46) + : : : : +- ^ NoopFilter (45) + : : : : +- ^ Scan parquet (44) + : : : +- ^ InputIteratorTransformer (69) + : : : +- ShuffleQueryStage (67), Statistics(X) + : : : +- ColumnarExchange (66) + : : : +- VeloxAppendBatches (65) + : : : +- ^ ProjectExecTransformer (63) + : : : +- ^ NoopFilter (62) + : : : +- ^ Scan parquet (61) + : : +- ^ InputIteratorTransformer (86) + : : +- ShuffleQueryStage (84), Statistics(X) + : : +- ColumnarExchange (83) + : : +- VeloxAppendBatches (82) + : : +- ^ ProjectExecTransformer (80) + : : +- ^ NoopFilter (79) + : : +- ^ Scan parquet (78) + : +- ^ InputIteratorTransformer (103) + : +- ShuffleQueryStage (101), Statistics(X) + : +- ColumnarExchange (100) + : +- VeloxAppendBatches (99) + : +- ^ ProjectExecTransformer (97) + : +- ^ NoopFilter (96) + : +- ^ Scan parquet (95) + +- ^ InputIteratorTransformer (120) + +- ShuffleQueryStage (118), Statistics(X) + +- ColumnarExchange (117) + +- VeloxAppendBatches (116) + +- ^ ProjectExecTransformer (114) + +- ^ NoopFilter (113) + +- ^ Scan parquet (112) +- == Initial Plan == - Sort (176) - +- Exchange (175) - +- HashAggregate (174) - +- Exchange (173) - +- HashAggregate (172) - +- Project (171) - +- ShuffledHashJoin Inner BuildRight (170) - :- Exchange (165) - : +- Project (164) - : +- ShuffledHashJoin Inner BuildRight (163) - : :- Exchange (159) - : : +- Project (158) - : : +- ShuffledHashJoin Inner BuildRight (157) - : : :- Exchange (153) - : : : +- Project (152) - : : : +- ShuffledHashJoin Inner BuildRight (151) - : : : :- Exchange (147) - : : : : +- Project (146) - : : : : +- ShuffledHashJoin Inner BuildRight (145) - : : : : :- Exchange (141) - : : : : : +- Project (140) - : : : : : +- ShuffledHashJoin Inner BuildRight (139) - : : : : : :- Exchange (135) - : : : : : : +- Project (134) - : : : : : : +- ShuffledHashJoin Inner BuildLeft (133) - : : : : : : :- Exchange (129) - : : : : : : : +- Project (128) - : : : : : : : +- Filter (127) - : : : : : : : +- Scan parquet (126) - : : : : : : +- Exchange (132) - : : : : : : +- Filter (131) - : : : : : : +- Scan parquet (130) - : : : : : +- Exchange (138) - : : : : : +- Filter (137) - : : : : : +- Scan parquet (136) - : : : : +- Exchange (144) - : : : : +- Filter (143) - : : : : +- Scan parquet (142) - : : : +- Exchange (150) - : : : +- Filter (149) - : : : +- Scan parquet (148) - : : +- Exchange (156) - : : +- Filter (155) - : : +- Scan parquet (154) - : +- Exchange (162) - : +- Filter (161) - : +- Scan parquet (160) - +- Exchange (169) - +- Project (168) - +- Filter (167) - +- Scan parquet (166) + Sort (206) + +- Exchange (205) + +- HashAggregate (204) + +- Exchange (203) + +- HashAggregate (202) + +- Project (201) + +- SortMergeJoin Inner (200) + :- Sort (194) + : +- Exchange (193) + : +- Project (192) + : +- SortMergeJoin Inner (191) + : :- Sort (186) + : : +- Exchange (185) + : : +- Project (184) + : : +- SortMergeJoin Inner (183) + : : :- Sort (178) + : : : +- Exchange (177) + : : : +- Project (176) + : : : +- SortMergeJoin Inner (175) + : : : :- Sort (170) + : : : : +- Exchange (169) + : : : : +- Project (168) + : : : : +- SortMergeJoin Inner (167) + : : : : :- Sort (162) + : : : : : +- Exchange (161) + : : : : : +- Project (160) + : : : : : +- SortMergeJoin Inner (159) + : : : : : :- Sort (154) + : : : : : : +- Exchange (153) + : : : : : : +- Project 
(152) + : : : : : : +- SortMergeJoin Inner (151) + : : : : : : :- Sort (146) + : : : : : : : +- Exchange (145) + : : : : : : : +- Project (144) + : : : : : : : +- Filter (143) + : : : : : : : +- Scan parquet (142) + : : : : : : +- Sort (150) + : : : : : : +- Exchange (149) + : : : : : : +- Filter (148) + : : : : : : +- Scan parquet (147) + : : : : : +- Sort (158) + : : : : : +- Exchange (157) + : : : : : +- Filter (156) + : : : : : +- Scan parquet (155) + : : : : +- Sort (166) + : : : : +- Exchange (165) + : : : : +- Filter (164) + : : : : +- Scan parquet (163) + : : : +- Sort (174) + : : : +- Exchange (173) + : : : +- Filter (172) + : : : +- Scan parquet (171) + : : +- Sort (182) + : : +- Exchange (181) + : : +- Filter (180) + : : +- Scan parquet (179) + : +- Sort (190) + : +- Exchange (189) + : +- Filter (188) + : +- Scan parquet (187) + +- Sort (199) + +- Exchange (198) + +- Project (197) + +- Filter (196) + +- Scan parquet (195) (1) Scan parquet @@ -182,746 +196,866 @@ Input [2]: [p_partkey#X, p_type#X] Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [p_partkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [p_partkey#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [6]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: 
[l_partkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(23) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: [s_suppkey#X, s_nationkey#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [5]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(40) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: 
hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(39) Scan parquet +(44) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [6]: [hash(o_custkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] +Arguments: X + +(57) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -(53) InputIteratorTransformer +(60) 
InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(55) NoopFilter +(62) NoopFilter Input [2]: [c_custkey#X, c_nationkey#X] Arguments: [c_custkey#X, c_nationkey#X] -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(c_custkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_nationkey#X] Input [2]: [c_custkey#X, c_nationkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join type: Inner Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [6]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] +Arguments: X + +(74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(70) NoopFilter +(79) NoopFilter Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: [n_nationkey#X, n_regionkey#X] -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_regionkey#X] Input [2]: [n_nationkey#X, n_regionkey#X] -(72) WholeStageCodegenTransformer (X) 
+(81) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] +Arguments: X + +(83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_regionkey#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [2]: [n_nationkey#X, n_regionkey#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [2]: [n_nationkey#X, n_regionkey#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [6]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(79) WholeStageCodegenTransformer (X) +(89) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: false -(80) ColumnarExchange +(90) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] +Arguments: X + +(91) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [id=#X] -(81) ShuffleQueryStage +(92) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: X -(82) InputAdapter +(93) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -(83) InputIteratorTransformer +(94) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -(84) Scan parquet +(95) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(85) NoopFilter +(96) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(86) ProjectExecTransformer +(97) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(100) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input 
[2]: [n_nationkey#X, n_name#X] -(92) ShuffledHashJoinExecTransformer +(104) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(93) ProjectExecTransformer +(105) ProjectExecTransformer Output [6]: [hash(n_regionkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(94) WholeStageCodegenTransformer (X) +(106) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: false -(95) ColumnarExchange +(107) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] +Arguments: X + +(108) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [id=#X] -(96) ShuffleQueryStage +(109) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: X -(97) InputAdapter +(110) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -(98) InputIteratorTransformer +(111) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -(99) Scan parquet +(112) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(100) NoopFilter +(113) NoopFilter Input [2]: [r_regionkey#X, r_name#X] Arguments: [r_regionkey#X, r_name#X] -(101) ProjectExecTransformer +(114) ProjectExecTransformer Output [2]: [hash(r_regionkey#X, 42) AS hash_partition_key#X, r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(102) WholeStageCodegenTransformer (X) +(115) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: false -(103) ColumnarExchange +(116) VeloxAppendBatches +Input [2]: [hash_partition_key#X, r_regionkey#X] +Arguments: X + +(117) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] -(104) ShuffleQueryStage +(118) ShuffleQueryStage Output [1]: [r_regionkey#X] Arguments: X -(105) InputAdapter +(119) InputAdapter Input [1]: [r_regionkey#X] -(106) InputIteratorTransformer +(120) InputIteratorTransformer Input [1]: [r_regionkey#X] -(107) ShuffledHashJoinExecTransformer +(121) ShuffledHashJoinExecTransformer Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join type: Inner Join condition: None -(108) ProjectExecTransformer +(122) ProjectExecTransformer Output [4]: [year(o_orderdate#X) AS o_year#X, (l_extendedprice#X * (1 - l_discount#X)) AS volume#X, n_name#X AS nation#X, CASE WHEN (n_name#X = BRAZIL) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END AS _pre_X#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(109) FlushableHashAggregateExecTransformer +(123) FlushableHashAggregateExecTransformer Input [4]: [o_year#X, volume#X, nation#X, _pre_X#X] Keys [1]: [o_year#X] Functions [2]: 
[partial_sum(_pre_X#X), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(110) ProjectExecTransformer +(124) ProjectExecTransformer Output [6]: [hash(o_year#X, 42) AS hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(111) WholeStageCodegenTransformer (X) +(125) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(112) ColumnarExchange +(126) VeloxAppendBatches +Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(127) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(113) ShuffleQueryStage +(128) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(114) InputAdapter +(129) InputAdapter Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(115) InputIteratorTransformer +(130) InputIteratorTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(116) RegularHashAggregateExecTransformer +(131) RegularHashAggregateExecTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(117) ProjectExecTransformer +(132) ProjectExecTransformer Output [2]: [o_year#X, (sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X / sum(volume#X)#X) AS mkt_share#X] Input [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(118) WholeStageCodegenTransformer (X) +(133) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(119) ColumnarExchange +(134) VeloxAppendBatches +Input [2]: [o_year#X, mkt_share#X] +Arguments: X + +(135) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(120) ShuffleQueryStage +(136) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] Arguments: X -(121) InputAdapter +(137) InputAdapter Input [2]: [o_year#X, mkt_share#X] -(122) InputIteratorTransformer +(138) InputIteratorTransformer Input [2]: [o_year#X, mkt_share#X] -(123) SortExecTransformer +(139) SortExecTransformer Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(124) WholeStageCodegenTransformer (X) +(140) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(125) VeloxColumnarToRowExec +(141) VeloxColumnarToRowExec Input [2]: [o_year#X, mkt_share#X] -(126) Scan parquet +(142) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_type), EqualTo(p_type,ECONOMY ANODIZED STEEL), IsNotNull(p_partkey)] ReadSchema: struct -(127) Filter +(143) Filter Input [2]: [p_partkey#X, p_type#X] Condition : ((isnotnull(p_type#X) AND (p_type#X = ECONOMY ANODIZED STEEL)) AND isnotnull(p_partkey#X)) -(128) Project +(144) Project Output [1]: 
[p_partkey#X] Input [2]: [p_partkey#X, p_type#X] -(129) Exchange +(145) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(130) Scan parquet +(146) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(147) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(131) Filter +(148) Filter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(132) Exchange +(149) Exchange Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(133) ShuffledHashJoin +(150) Sort +Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(151) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: None -(134) Project +(152) Project Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(135) Exchange +(153) Exchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(136) Scan parquet +(154) Sort +Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(155) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(137) Filter +(156) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(138) Exchange +(157) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(139) ShuffledHashJoin +(158) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(159) SortMergeJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(140) Project +(160) Project Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(141) Exchange +(161) Exchange Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(142) Scan parquet +(162) Sort +Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(163) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(143) Filter +(164) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 
1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(144) Exchange +(165) Exchange Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(145) ShuffledHashJoin +(166) Sort +Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(167) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(146) Project +(168) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(147) Exchange +(169) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(148) Scan parquet +(170) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(171) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(149) Filter +(172) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(150) Exchange +(173) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(151) ShuffledHashJoin +(174) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(175) SortMergeJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join type: Inner Join condition: None -(152) Project +(176) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(153) Exchange +(177) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(154) Scan parquet +(178) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(179) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(155) Filter +(180) Filter Input [2]: [n_nationkey#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(156) Exchange +(181) Exchange Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(157) ShuffledHashJoin +(182) Sort +Input [2]: [n_nationkey#X, n_regionkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(183) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(158) Project +(184) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(159) Exchange +(185) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] 
Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(160) Scan parquet +(186) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(187) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(161) Filter +(188) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(162) Exchange +(189) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(163) ShuffledHashJoin +(190) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(191) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(164) Project +(192) Project Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(165) Exchange +(193) Exchange Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(166) Scan parquet +(194) Sort +Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] +Arguments: [n_regionkey#X ASC NULLS FIRST], false, 0 + +(195) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(167) Filter +(196) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(168) Project +(197) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(169) Exchange +(198) Exchange Input [1]: [r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(170) ShuffledHashJoin +(199) Sort +Input [1]: [r_regionkey#X] +Arguments: [r_regionkey#X ASC NULLS FIRST], false, 0 + +(200) SortMergeJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join type: Inner Join condition: None -(171) Project +(201) Project Output [3]: [year(o_orderdate#X) AS o_year#X, (l_extendedprice#X * (1 - l_discount#X)) AS volume#X, n_name#X AS nation#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(172) HashAggregate +(202) HashAggregate Input [3]: [o_year#X, volume#X, nation#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(173) Exchange +(203) Exchange Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(174) HashAggregate +(204) HashAggregate Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [2]: [o_year#X, (sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X / 
sum(volume#X)#X) AS mkt_share#X] -(175) Exchange +(205) Exchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(176) Sort +(206) Sort Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(177) AdaptiveSparkPlan +(207) AdaptiveSparkPlan Output [2]: [o_year#X, mkt_share#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/9.txt index 3f5479af2b147..4b983de23fdec 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/9.txt @@ -1,126 +1,136 @@ == Physical Plan == -AdaptiveSparkPlan (133) +AdaptiveSparkPlan (155) +- == Final Plan == - VeloxColumnarToRowExec (94) - +- ^ SortExecTransformer (92) - +- ^ InputIteratorTransformer (91) - +- ^ InputAdapter (90) - +- ^ ShuffleQueryStage (89), Statistics(X) - +- ColumnarExchange (88) - +- ^ RegularHashAggregateExecTransformer (86) - +- ^ InputIteratorTransformer (85) - +- ^ InputAdapter (84) - +- ^ ShuffleQueryStage (83), Statistics(X) - +- ColumnarExchange (82) - +- ^ ProjectExecTransformer (80) - +- ^ FlushableHashAggregateExecTransformer (79) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (106) + +- ^ SortExecTransformer (104) + +- ^ InputIteratorTransformer (103) + +- ShuffleQueryStage (101), Statistics(X) + +- ColumnarExchange (100) + +- VeloxAppendBatches (99) + +- ^ RegularHashAggregateExecTransformer (97) + +- ^ InputIteratorTransformer (96) + +- ShuffleQueryStage (94), Statistics(X) + +- ColumnarExchange (93) + +- VeloxAppendBatches (92) + +- ^ ProjectExecTransformer (90) + +- ^ FlushableHashAggregateExecTransformer (89) + +- ^ ProjectExecTransformer (88) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75), Statistics(X) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58), Statistics(X) + : : +- ColumnarExchange (57) + : : +- 
VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41), Statistics(X) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ NoopFilter (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ NoopFilter (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ NoopFilter (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ NoopFilter (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ NoopFilter (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74), Statistics(X) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ NoopFilter (70) - +- ^ Scan parquet (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ NoopFilter (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33), Statistics(X) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ NoopFilter (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ NoopFilter (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ NoopFilter (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (86) + +- ShuffleQueryStage (84), Statistics(X) + +- ColumnarExchange (83) + +- VeloxAppendBatches (82) + +- ^ ProjectExecTransformer (80) + +- ^ NoopFilter (79) + +- ^ Scan parquet (78) +- == Initial Plan == - Sort (132) - +- Exchange (131) - +- HashAggregate (130) - +- Exchange (129) - +- HashAggregate (128) - +- Project (127) - +- 
ShuffledHashJoin Inner BuildRight (126) - :- Exchange (122) - : +- Project (121) - : +- ShuffledHashJoin Inner BuildRight (120) - : :- Exchange (116) - : : +- Project (115) - : : +- ShuffledHashJoin Inner BuildRight (114) - : : :- Exchange (110) - : : : +- Project (109) - : : : +- ShuffledHashJoin Inner BuildRight (108) - : : : :- Exchange (104) - : : : : +- Project (103) - : : : : +- ShuffledHashJoin Inner BuildLeft (102) - : : : : :- Exchange (98) - : : : : : +- Project (97) - : : : : : +- Filter (96) - : : : : : +- Scan parquet (95) - : : : : +- Exchange (101) - : : : : +- Filter (100) - : : : : +- Scan parquet (99) - : : : +- Exchange (107) - : : : +- Filter (106) - : : : +- Scan parquet (105) - : : +- Exchange (113) - : : +- Filter (112) - : : +- Scan parquet (111) - : +- Exchange (119) - : +- Filter (118) - : +- Scan parquet (117) - +- Exchange (125) - +- Filter (124) - +- Scan parquet (123) + Sort (154) + +- Exchange (153) + +- HashAggregate (152) + +- Exchange (151) + +- HashAggregate (150) + +- Project (149) + +- SortMergeJoin Inner (148) + :- Sort (143) + : +- Exchange (142) + : +- Project (141) + : +- SortMergeJoin Inner (140) + : :- Sort (135) + : : +- Exchange (134) + : : +- Project (133) + : : +- SortMergeJoin Inner (132) + : : :- Sort (127) + : : : +- Exchange (126) + : : : +- Project (125) + : : : +- SortMergeJoin Inner (124) + : : : :- Sort (119) + : : : : +- Exchange (118) + : : : : +- Project (117) + : : : : +- SortMergeJoin Inner (116) + : : : : :- Sort (111) + : : : : : +- Exchange (110) + : : : : : +- Project (109) + : : : : : +- Filter (108) + : : : : : +- Scan parquet (107) + : : : : +- Sort (115) + : : : : +- Exchange (114) + : : : : +- Filter (113) + : : : : +- Scan parquet (112) + : : : +- Sort (123) + : : : +- Exchange (122) + : : : +- Filter (121) + : : : +- Scan parquet (120) + : : +- Sort (131) + : : +- Exchange (130) + : : +- Filter (129) + : : +- Scan parquet (128) + : +- Sort (139) + : +- Exchange (138) + : +- Filter (137) + : +- Scan parquet (136) + +- Sort (147) + +- Exchange (146) + +- Filter (145) + +- Scan parquet (144) (1) Scan parquet @@ -142,558 +152,646 @@ Input [2]: [p_partkey#X, p_name#X] Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [p_partkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [p_partkey#X] -(9) Scan parquet +(10) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) NoopFilter +(11) NoopFilter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [7]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, 
l_discount#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(15) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [7]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(23) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(25) NoopFilter +(28) NoopFilter Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: [s_suppkey#X, s_nationkey#X] -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(28) ColumnarExchange 
+(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [8]: [hash(l_suppkey#X, l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(40) ColumnarExchange Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(39) Scan parquet +(44) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(40) NoopFilter +(45) NoopFilter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(ps_suppkey#X, ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, 
ps_suppkey#X, ps_supplycost#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join type: Inner Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [7]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] +Arguments: X + +(57) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(55) NoopFilter +(62) NoopFilter Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: [o_orderkey#X, o_orderdate#X] -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_orderdate#X] Input [2]: [o_orderkey#X, o_orderdate#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderdate#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [o_orderkey#X, o_orderdate#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderdate#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] 
Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [7]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] +Arguments: X + +(74) ColumnarExchange Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(70) NoopFilter +(79) NoopFilter Input [2]: [n_nationkey#X, n_name#X] Arguments: [n_nationkey#X, n_name#X] -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, ((l_extendedprice#X * (1 - l_discount#X)) - (ps_supplycost#X * l_quantity#X)) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(79) FlushableHashAggregateExecTransformer +(89) FlushableHashAggregateExecTransformer Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] 
Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(80) ProjectExecTransformer +(90) ProjectExecTransformer Output [5]: [hash(nation#X, o_year#X, 42) AS hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(81) WholeStageCodegenTransformer (X) +(91) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: false -(82) ColumnarExchange +(92) VeloxAppendBatches +Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] +Arguments: X + +(93) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(83) ShuffleQueryStage +(94) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: X -(84) InputAdapter +(95) InputAdapter Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(85) InputIteratorTransformer +(96) InputIteratorTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(86) RegularHashAggregateExecTransformer +(97) RegularHashAggregateExecTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [3]: [nation#X, o_year#X, sum_profit#X] +Arguments: X + +(100) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [3]: [nation#X, o_year#X, sum_profit#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] -(92) SortExecTransformer +(104) SortExecTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(93) WholeStageCodegenTransformer (X) +(105) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(94) VeloxColumnarToRowExec +(106) VeloxColumnarToRowExec Input [3]: [nation#X, o_year#X, sum_profit#X] -(95) Scan parquet +(107) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringContains(p_name,green), IsNotNull(p_partkey)] ReadSchema: struct -(96) Filter +(108) Filter Input [2]: [p_partkey#X, p_name#X] Condition : ((isnotnull(p_name#X) AND Contains(p_name#X, green)) AND isnotnull(p_partkey#X)) -(97) Project +(109) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(98) Exchange +(110) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Scan parquet +(111) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(112) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] 
PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(100) Filter +(113) Filter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(101) Exchange +(114) Exchange Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) ShuffledHashJoin +(115) Sort +Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(116) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: None -(103) Project +(117) Project Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(104) Exchange +(118) Exchange Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) Scan parquet +(119) Sort +Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(120) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(106) Filter +(121) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(107) Exchange +(122) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) ShuffledHashJoin +(123) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(124) SortMergeJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(109) Project +(125) Project Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(110) Exchange +(126) Exchange Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(111) Scan parquet +(127) Sort +Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [l_suppkey#X ASC NULLS FIRST, l_partkey#X ASC NULLS FIRST], false, 0 + +(128) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(112) Filter +(129) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Condition : (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(113) Exchange +(130) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(114) ShuffledHashJoin +(131) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, 
ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST, ps_partkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join type: Inner Join condition: None -(115) Project +(133) Project Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(116) Exchange +(134) Exchange Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) Scan parquet +(135) Sort +Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(136) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(118) Filter +(137) Filter Input [2]: [o_orderkey#X, o_orderdate#X] Condition : isnotnull(o_orderkey#X) -(119) Exchange +(138) Exchange Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(120) ShuffledHashJoin +(139) Sort +Input [2]: [o_orderkey#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(140) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(121) Project +(141) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(122) Exchange +(142) Exchange Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(123) Scan parquet +(143) Sort +Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(144) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(124) Filter +(145) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(125) Exchange +(146) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(126) ShuffledHashJoin +(147) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(148) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(127) Project +(149) Project Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, ((l_extendedprice#X * (1 - l_discount#X)) - (ps_supplycost#X * l_quantity#X)) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(128) HashAggregate +(150) HashAggregate Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] 
-(129) Exchange +(151) Exchange Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(130) HashAggregate +(152) HashAggregate Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(131) Exchange +(153) Exchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(132) Sort +(154) Sort Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(133) AdaptiveSparkPlan +(155) AdaptiveSparkPlan Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/1.txt index 7a80ed95f3e8d..63b7d317f3cf2 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/1.txt @@ -1,31 +1,31 @@ == Physical Plan == -AdaptiveSparkPlan (28) +AdaptiveSparkPlan (30) +- == Final Plan == - VeloxColumnarToRowExec (19) - +- ^ SortExecTransformer (17) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (21) + +- ^ SortExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ ProjectExecTransformer (5) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ FilterExecTransformer (2) +- ^ Scan parquet (1) +- == Initial Plan == - Sort (27) - +- Exchange (26) - +- HashAggregate (25) - +- Exchange (24) - +- HashAggregate (23) - +- Project (22) - +- Filter (21) - +- Scan parquet (20) + Sort (29) + +- Exchange (28) + +- HashAggregate (27) + +- Exchange (26) + +- HashAggregate (25) + +- Project (24) + +- Filter (23) + +- Scan parquet (22) (1) Scan parquet @@ -58,97 +58,105 @@ Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] +Arguments: X + +(8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, 
sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true)), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X AS sum_charge#X, 
avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] +Arguments: X + +(15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(17) SortExecTransformer +(19) SortExecTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(18) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(19) VeloxColumnarToRowExec +(21) VeloxColumnarToRowExec Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(20) Scan parquet +(22) Scan parquet Output [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), LessThanOrEqual(l_shipdate,1998-09-02)] ReadSchema: struct -(21) Filter +(23) Filter Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Condition : (isnotnull(l_shipdate#X) AND (l_shipdate#X <= 1998-09-02)) -(22) Project +(24) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] -(23) HashAggregate +(25) HashAggregate Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [partial_sum(l_quantity#X), partial_sum(l_extendedprice#X), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), 
DecimalType(26,4), true)), partial_sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true)), partial_avg(l_quantity#X), partial_avg(l_extendedprice#X), partial_avg(l_discount#X), partial_count(1)] Aggregate Attributes [15]: [sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Results [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(24) Exchange +(26) Exchange Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(25) HashAggregate +(27) HashAggregate Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true)), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS sum_disc_price#X, 
sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true)) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2), true) as decimal(26,4)))), DecimalType(38,6), true))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(26) Exchange +(28) Exchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Sort +(29) Sort Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(28) AdaptiveSparkPlan +(30) AdaptiveSparkPlan Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/10.txt index 6b7f5bab6b1c2..993884df3f3ac 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/10.txt @@ -1,85 +1,91 @@ == Physical Plan == -AdaptiveSparkPlan (87) +AdaptiveSparkPlan (100) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- TakeOrderedAndProjectExecTransformer (59) - +- ^ ProjectExecTransformer (57) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53) - +- ColumnarExchange (52) - +- ^ ProjectExecTransformer (50) - +- ^ FlushableHashAggregateExecTransformer (49) - +- ^ ProjectExecTransformer (48) - +- ^ ShuffledHashJoinExecTransformer Inner (47) - :- ^ InputIteratorTransformer (38) - : +- ^ InputAdapter (37) - : +- ^ ShuffleQueryStage (36) - : +- ColumnarExchange (35) - : +- ^ ProjectExecTransformer (33) - : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : :- ^ InputIteratorTransformer (23) - : : +- ^ InputAdapter (22) - : : +- ^ ShuffleQueryStage (21) - : : +- ColumnarExchange (20) - : : +- ^ ProjectExecTransformer (18) - : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : :- ^ InputIteratorTransformer (8) - : : : +- ^ InputAdapter (7) - : : : +- ^ ShuffleQueryStage (6) - : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (67) + +- TakeOrderedAndProjectExecTransformer (66) + +- ^ ProjectExecTransformer (64) + +- ^ RegularHashAggregateExecTransformer (63) + +- ^ InputIteratorTransformer (62) + +- ShuffleQueryStage (60) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ ProjectExecTransformer (56) + +- ^ FlushableHashAggregateExecTransformer (55) + +- ^ ProjectExecTransformer (54) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + :- ^ InputIteratorTransformer (43) + : +- ShuffleQueryStage (41) + : +- ColumnarExchange (40) + : +- VeloxAppendBatches (39) + : +- ^ ProjectExecTransformer (37) + : 
+- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + : :- ^ InputIteratorTransformer (26) + : : +- ShuffleQueryStage (24) + : : +- ColumnarExchange (23) + : : +- VeloxAppendBatches (22) + : : +- ^ ProjectExecTransformer (20) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : : :- ^ InputIteratorTransformer (9) + : : : +- ShuffleQueryStage (7) + : : : +- ColumnarExchange (6) + : : : +- VeloxAppendBatches (5) : : : +- ^ ProjectExecTransformer (3) : : : +- ^ FilterExecTransformer (2) : : : +- ^ Scan parquet (1) - : : +- ^ InputIteratorTransformer (16) - : : +- ^ InputAdapter (15) - : : +- ^ ShuffleQueryStage (14) - : : +- ColumnarExchange (13) - : : +- ^ ProjectExecTransformer (11) - : : +- ^ FilterExecTransformer (10) - : : +- ^ Scan parquet (9) - : +- ^ InputIteratorTransformer (31) - : +- ^ InputAdapter (30) - : +- ^ ShuffleQueryStage (29) - : +- ColumnarExchange (28) - : +- ^ ProjectExecTransformer (26) - : +- ^ FilterExecTransformer (25) - : +- ^ Scan parquet (24) - +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ ShuffleQueryStage (44) - +- ColumnarExchange (43) - +- ^ ProjectExecTransformer (41) - +- ^ FilterExecTransformer (40) - +- ^ Scan parquet (39) + : : +- ^ InputIteratorTransformer (18) + : : +- ShuffleQueryStage (16) + : : +- ColumnarExchange (15) + : : +- VeloxAppendBatches (14) + : : +- ^ ProjectExecTransformer (12) + : : +- ^ FilterExecTransformer (11) + : : +- ^ Scan parquet (10) + : +- ^ InputIteratorTransformer (35) + : +- ShuffleQueryStage (33) + : +- ColumnarExchange (32) + : +- VeloxAppendBatches (31) + : +- ^ ProjectExecTransformer (29) + : +- ^ FilterExecTransformer (28) + : +- ^ Scan parquet (27) + +- ^ InputIteratorTransformer (52) + +- ShuffleQueryStage (50) + +- ColumnarExchange (49) + +- VeloxAppendBatches (48) + +- ^ ProjectExecTransformer (46) + +- ^ FilterExecTransformer (45) + +- ^ Scan parquet (44) +- == Initial Plan == - TakeOrderedAndProject (86) - +- HashAggregate (85) - +- Exchange (84) - +- HashAggregate (83) - +- Project (82) - +- ShuffledHashJoin Inner BuildRight (81) - :- Exchange (77) - : +- Project (76) - : +- ShuffledHashJoin Inner BuildRight (75) - : :- Exchange (70) - : : +- Project (69) - : : +- ShuffledHashJoin Inner BuildRight (68) - : : :- Exchange (63) - : : : +- Filter (62) - : : : +- Scan parquet (61) - : : +- Exchange (67) - : : +- Project (66) - : : +- Filter (65) - : : +- Scan parquet (64) - : +- Exchange (74) - : +- Project (73) - : +- Filter (72) - : +- Scan parquet (71) - +- Exchange (80) - +- Filter (79) - +- Scan parquet (78) + TakeOrderedAndProject (99) + +- HashAggregate (98) + +- Exchange (97) + +- HashAggregate (96) + +- Project (95) + +- SortMergeJoin Inner (94) + :- Sort (89) + : +- Exchange (88) + : +- Project (87) + : +- SortMergeJoin Inner (86) + : :- Sort (80) + : : +- Exchange (79) + : : +- Project (78) + : : +- SortMergeJoin Inner (77) + : : :- Sort (71) + : : : +- Exchange (70) + : : : +- Filter (69) + : : : +- Scan parquet (68) + : : +- Sort (76) + : : +- Exchange (75) + : : +- Project (74) + : : +- Filter (73) + : : +- Scan parquet (72) + : +- Sort (85) + : +- Exchange (84) + : +- Project (83) + : +- Filter (82) + : +- Scan parquet (81) + +- Sort (93) + +- Exchange (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet @@ -101,358 +107,410 @@ Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acct Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: false -(5) 
ColumnarExchange +(5) VeloxAppendBatches +Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] +Arguments: X + +(6) ColumnarExchange Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [9]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] +Arguments: X + +(23) ColumnarExchange Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, 
c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Arguments: ((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [10]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(40) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, 
c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: isnotnull(n_nationkey#X) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [10]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS _pre_X#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(49) FlushableHashAggregateExecTransformer +(55) FlushableHashAggregateExecTransformer Input [10]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X, _pre_X#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(50) ProjectExecTransformer +(56) ProjectExecTransformer Output [10]: [hash(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, 
c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(51) WholeStageCodegenTransformer (X) +(57) WholeStageCodegenTransformer (X) Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: false -(52) ColumnarExchange +(58) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] +Arguments: X + +(59) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(60) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: X -(54) InputAdapter +(61) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(55) InputIteratorTransformer +(62) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(56) RegularHashAggregateExecTransformer +(63) RegularHashAggregateExecTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(57) ProjectExecTransformer +(64) ProjectExecTransformer Output [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Input [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(58) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, 
n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: false -(59) TakeOrderedAndProjectExecTransformer +(66) TakeOrderedAndProjectExecTransformer Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X], 0 -(60) VeloxColumnarToRowExec +(67) VeloxColumnarToRowExec Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(61) Scan parquet +(68) Scan parquet Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(62) Filter +(69) Filter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(63) Exchange +(70) Exchange Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) Scan parquet +(71) Sort +Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(72) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(65) Filter +(73) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(66) Project +(74) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(67) Exchange +(75) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) ShuffledHashJoin +(76) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(77) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(69) Project +(78) Project Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(70) Exchange +(79) Exchange Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(80) Sort +Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(81) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(72) Filter +(82) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Condition : ((isnotnull(l_returnflag#X) AND 
(l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(73) Project +(83) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(74) Exchange +(84) Exchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(75) ShuffledHashJoin +(85) Sort +Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(86) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(76) Project +(87) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(77) Exchange +(88) Exchange Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(78) Scan parquet +(89) Sort +Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(79) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(80) Exchange +(92) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(81) ShuffledHashJoin +(93) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(94) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(82) Project +(95) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(83) HashAggregate +(96) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(84) Exchange +(97) Exchange Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) HashAggregate +(98) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, 
c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(86) TakeOrderedAndProject +(99) TakeOrderedAndProject Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(87) AdaptiveSparkPlan +(100) AdaptiveSparkPlan Output [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/11.txt index 4fe3c8b62ca93..8142375d9ead2 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/11.txt @@ -1,71 +1,75 @@ == Physical Plan == -AdaptiveSparkPlan (72) +AdaptiveSparkPlan (82) +- == Final Plan == - VeloxColumnarToRowExec (50) - +- ^ SortExecTransformer (48) - +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ ShuffleQueryStage (45) - +- ColumnarExchange (44) - +- ^ FilterExecTransformer (42) - +- ^ RegularHashAggregateExecTransformer (41) - +- ^ InputIteratorTransformer (40) - +- ^ InputAdapter (39) - +- ^ ShuffleQueryStage (38) - +- ColumnarExchange (37) - +- ^ ProjectExecTransformer (35) - +- ^ FlushableHashAggregateExecTransformer (34) - +- ^ ProjectExecTransformer (33) - +- ^ ShuffledHashJoinExecTransformer Inner (32) - :- ^ InputIteratorTransformer (23) - : +- ^ InputAdapter (22) - : +- ^ ShuffleQueryStage (21) - : +- ColumnarExchange (20) - : +- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (56) + +- ^ SortExecTransformer (54) + +- ^ InputIteratorTransformer (53) + +- ShuffleQueryStage (51) + +- ColumnarExchange (50) + +- VeloxAppendBatches (49) + +- ^ FilterExecTransformer (47) + +- ^ RegularHashAggregateExecTransformer (46) + +- ^ InputIteratorTransformer (45) + +- ShuffleQueryStage (43) + +- ColumnarExchange (42) + +- VeloxAppendBatches (41) + +- ^ ProjectExecTransformer (39) + +- ^ FlushableHashAggregateExecTransformer (38) + +- ^ ProjectExecTransformer (37) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + :- ^ InputIteratorTransformer (26) + : +- ShuffleQueryStage (24) + : +- ColumnarExchange (23) + : +- VeloxAppendBatches (22) + : +- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner 
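In the vanilla-Spark Initial Plans above, each ShuffledHashJoin is replaced by an Exchange -> Sort -> SortMergeJoin chain, while the Gluten Final Plans keep ShuffledHashJoinExecTransformer; presumably the baseline suite now lets stock Spark pick its default shuffle join. As a minimal sketch (a generic local session, not the test suite's actual configuration), the settings that steer stock Spark's equi-joins toward sort-merge join look like this:

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder()
      .appName("join-strategy-sketch")
      .master("local[*]")
      // Rule out broadcast joins so the planner must use a shuffle-based join.
      .config("spark.sql.autoBroadcastJoinThreshold", "-1")
      // Default is true: equi-joins on sortable keys are planned as SortMergeJoin.
      .config("spark.sql.join.preferSortMergeJoin", "true")
      .getOrCreate()

    val customer = spark.range(0, 1000).selectExpr("id AS c_custkey")
    val orders   = spark.range(0, 5000).selectExpr("id % 1000 AS o_custkey", "id AS o_orderkey")

    // Plans as Exchange -> Sort -> SortMergeJoin, mirroring the Initial Plan shape
    // in the updated fixtures; with preferSortMergeJoin=false the planner may
    // instead pick ShuffledHashJoin when the build side is small enough.
    customer.join(orders, customer("c_custkey") === orders("o_custkey"))
      .explain("formatted")
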
BuildRight (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ FilterExecTransformer (10) - : +- ^ Scan parquet (9) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29) - +- ColumnarExchange (28) - +- ^ ProjectExecTransformer (26) - +- ^ FilterExecTransformer (25) - +- ^ Scan parquet (24) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FilterExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ ProjectExecTransformer (29) + +- ^ FilterExecTransformer (28) + +- ^ Scan parquet (27) +- == Initial Plan == - Sort (71) - +- Exchange (70) - +- Filter (69) - +- HashAggregate (68) - +- Exchange (67) - +- HashAggregate (66) - +- Project (65) - +- ShuffledHashJoin Inner BuildRight (64) - :- Exchange (59) - : +- Project (58) - : +- ShuffledHashJoin Inner BuildRight (57) - : :- Exchange (53) - : : +- Filter (52) - : : +- Scan parquet (51) - : +- Exchange (56) - : +- Filter (55) - : +- Scan parquet (54) - +- Exchange (63) - +- Project (62) - +- Filter (61) - +- Scan parquet (60) + Sort (81) + +- Exchange (80) + +- Filter (79) + +- HashAggregate (78) + +- Exchange (77) + +- HashAggregate (76) + +- Project (75) + +- SortMergeJoin Inner (74) + :- Sort (68) + : +- Exchange (67) + : +- Project (66) + : +- SortMergeJoin Inner (65) + : :- Sort (60) + : : +- Exchange (59) + : : +- Filter (58) + : : +- Scan parquet (57) + : +- Sort (64) + : +- Exchange (63) + : +- Filter (62) + : +- Scan parquet (61) + +- Sort (73) + +- Exchange (72) + +- Project (71) + +- Filter (70) + +- Scan parquet (69) (1) Scan parquet @@ -87,292 +91,332 @@ Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(9) Scan parquet +(10) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(11) ProjectExecTransformer +(12) 
ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: X + +(23) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [1]: [n_nationkey#X] -(31) 
InputIteratorTransformer +(35) InputIteratorTransformer Input [1]: [n_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true) AS _pre_X#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(34) FlushableHashAggregateExecTransformer +(38) FlushableHashAggregateExecTransformer Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(35) ProjectExecTransformer +(39) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(36) WholeStageCodegenTransformer (X) +(40) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: false -(37) ColumnarExchange +(41) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] +Arguments: X + +(42) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(38) ShuffleQueryStage +(43) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: X -(39) InputAdapter +(44) InputAdapter Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(40) InputIteratorTransformer +(45) InputIteratorTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(41) RegularHashAggregateExecTransformer +(46) RegularHashAggregateExecTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X AS value#X] -(42) FilterExecTransformer +(47) FilterExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(43) WholeStageCodegenTransformer (X) +(48) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(44) ColumnarExchange +(49) VeloxAppendBatches +Input [2]: [ps_partkey#X, value#X] +Arguments: X + +(50) ColumnarExchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(45) ShuffleQueryStage +(51) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] Arguments: X -(46) InputAdapter +(52) InputAdapter Input [2]: [ps_partkey#X, value#X] -(47) InputIteratorTransformer +(53) InputIteratorTransformer Input [2]: [ps_partkey#X, value#X] -(48) SortExecTransformer +(54) SortExecTransformer 
Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(50) VeloxColumnarToRowExec +(56) VeloxColumnarToRowExec Input [2]: [ps_partkey#X, value#X] -(51) Scan parquet +(57) Scan parquet Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(52) Filter +(58) Filter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(53) Exchange +(59) Exchange Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) Scan parquet +(60) Sort +Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(61) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(55) Filter +(62) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(56) Exchange +(63) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(57) ShuffledHashJoin +(64) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(65) SortMergeJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(58) Project +(66) Project Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(59) Exchange +(67) Exchange Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(60) Scan parquet +(68) Sort +Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(69) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(61) Filter +(70) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(62) Project +(71) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(63) Exchange +(72) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) ShuffledHashJoin +(73) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(74) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(65) Project +(75) Project Output [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(66) HashAggregate +(76) HashAggregate Input [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))] Aggregate Attributes 
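Throughout these fixtures the new VeloxAppendBatches (X) step runs immediately upstream of each ColumnarExchange; the operator name suggests it appends small upstream batches into larger ones before they reach the shuffle. Purely as a conceptual illustration (a toy iterator over in-memory Seq batches, not Gluten's actual columnar-batch API or operator), the coalescing idea looks roughly like this:

    object AppendBatchesSketch {
      // Toy stand-in for a columnar batch: just a sequence of row values.
      type Batch = Seq[Int]

      /** Append consecutive small batches until at least `minRows` rows are buffered. */
      def appendBatches(batches: Iterator[Batch], minRows: Int): Iterator[Batch] =
        new Iterator[Batch] {
          def hasNext: Boolean = batches.hasNext
          def next(): Batch = {
            val buf = scala.collection.mutable.ArrayBuffer.empty[Int]
            while (batches.hasNext && buf.size < minRows) buf ++= batches.next()
            buf.toSeq
          }
        }

      def main(args: Array[String]): Unit = {
        // Ten 2-row batches are appended into two 10-row batches before the "exchange".
        val small = Iterator.tabulate(10)(i => Seq(2 * i, 2 * i + 1))
        appendBatches(small, minRows = 10).foreach(b => println(s"batch of ${b.size} rows"))
      }
    }
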
[2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(67) Exchange +(77) Exchange Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) HashAggregate +(78) HashAggregate Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(cast(ps_availqty#X as decimal(10,0)) as decimal(12,2)))), DecimalType(23,2), true))#X AS value#X] -(69) Filter +(79) Filter Input [2]: [ps_partkey#X, value#X] Condition : (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(70) Exchange +(80) Exchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Sort +(81) Sort Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(72) AdaptiveSparkPlan +(82) AdaptiveSparkPlan Output [2]: [ps_partkey#X, value#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/12.txt index f8bca5a8c3899..802a79759235a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/12.txt @@ -1,50 +1,52 @@ == Physical Plan == -AdaptiveSparkPlan (49) +AdaptiveSparkPlan (55) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- ^ SortExecTransformer (32) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29) - +- ColumnarExchange (28) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ 
InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (48) - +- Exchange (47) - +- HashAggregate (46) - +- Exchange (45) - +- HashAggregate (44) - +- Project (43) - +- ShuffledHashJoin Inner BuildLeft (42) - :- Exchange (37) - : +- Filter (36) - : +- Scan parquet (35) - +- Exchange (41) - +- Project (40) - +- Filter (39) - +- Scan parquet (38) + Sort (54) + +- Exchange (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- Project (49) + +- SortMergeJoin Inner (48) + :- Sort (42) + : +- Exchange (41) + : +- Filter (40) + : +- Scan parquet (39) + +- Sort (47) + +- Exchange (46) + +- Project (45) + +- Filter (44) + +- Scan parquet (43) (1) Scan parquet @@ -66,196 +68,220 @@ Input [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [o_orderkey#X, o_orderpriority#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderpriority#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Arguments: ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_shipmode#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [l_orderkey#X, l_shipmode#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [l_orderkey#X, l_shipmode#X] -(17) 
ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [o_orderpriority#X, l_shipmode#X, CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END AS _pre_X#X, CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END AS _pre_X#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [o_orderpriority#X, l_shipmode#X, _pre_X#X, _pre_X#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(_pre_X#X), partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [4]: [hash(l_shipmode#X, 42) AS hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Input [3]: [l_shipmode#X, sum#X, sum#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] +Arguments: X + +(25) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [3]: [l_shipmode#X, sum#X, sum#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(32) 
SortExecTransformer +(36) SortExecTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(33) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(34) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(35) Scan parquet +(39) Scan parquet Output [2]: [o_orderkey#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(36) Filter +(40) Filter Input [2]: [o_orderkey#X, o_orderpriority#X] Condition : isnotnull(o_orderkey#X) -(37) Exchange +(41) Exchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(38) Scan parquet +(42) Sort +Input [2]: [o_orderkey#X, o_orderpriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(43) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(39) Filter +(44) Filter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Condition : ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(40) Project +(45) Project Output [2]: [l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(41) Exchange +(46) Exchange Input [2]: [l_orderkey#X, l_shipmode#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(42) ShuffledHashJoin +(47) Sort +Input [2]: [l_orderkey#X, l_shipmode#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(48) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(43) Project +(49) Project Output [2]: [o_orderpriority#X, l_shipmode#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(44) HashAggregate +(50) HashAggregate Input [2]: [o_orderpriority#X, l_shipmode#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), partial_sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(45) Exchange +(51) Exchange Input [3]: [l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(46) HashAggregate +(52) HashAggregate Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 
ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(47) Exchange +(53) Exchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(48) Sort +(54) Sort Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(49) AdaptiveSparkPlan +(55) AdaptiveSparkPlan Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/13.txt index 14f2b770f9960..b9bf0f1fad603 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/13.txt @@ -1,53 +1,55 @@ == Physical Plan == -AdaptiveSparkPlan (52) +AdaptiveSparkPlan (58) +- == Final Plan == - VeloxColumnarToRowExec (36) - +- ^ SortExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31) - +- ColumnarExchange (30) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ ProjectExecTransformer (17) - +- ^ ShuffledHashJoinExecTransformer LeftOuter (16) - :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ ShuffleQueryStage (5) - : +- ColumnarExchange (4) + VeloxColumnarToRowExec (40) + +- ^ SortExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ FlushableHashAggregateExecTransformer (23) + +- ^ ProjectExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ ProjectExecTransformer (19) + +- ^ ShuffledHashJoinExecTransformer LeftOuter BuildLeft (18) + :- ^ InputIteratorTransformer (8) + : +- ShuffleQueryStage (6) + : +- ColumnarExchange (5) + : +- VeloxAppendBatches (4) : +- ^ ProjectExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (15) - +- ^ InputAdapter (14) - +- ^ ShuffleQueryStage (13) - +- ColumnarExchange (12) - +- ^ ProjectExecTransformer (10) - +- ^ FilterExecTransformer (9) - +- ^ Scan parquet (8) + +- ^ InputIteratorTransformer (17) + +- ShuffleQueryStage (15) + +- ColumnarExchange (14) + +- VeloxAppendBatches (13) + +- ^ ProjectExecTransformer (11) + +- ^ FilterExecTransformer (10) + +- ^ Scan parquet (9) +- == Initial Plan == - Sort (51) - +- Exchange (50) - +- HashAggregate (49) - +- Exchange (48) - +- HashAggregate (47) - +- HashAggregate (46) - +- 
HashAggregate (45) - +- Project (44) - +- ShuffledHashJoin LeftOuter BuildRight (43) - :- Exchange (38) - : +- Scan parquet (37) - +- Exchange (42) - +- Project (41) - +- Filter (40) - +- Scan parquet (39) + Sort (57) + +- Exchange (56) + +- HashAggregate (55) + +- Exchange (54) + +- HashAggregate (53) + +- HashAggregate (52) + +- HashAggregate (51) + +- Project (50) + +- SortMergeJoin LeftOuter (49) + :- Sort (43) + : +- Exchange (42) + : +- Scan parquet (41) + +- Sort (48) + +- Exchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -64,223 +66,247 @@ Input [1]: [c_custkey#X] Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: false -(4) ColumnarExchange +(4) VeloxAppendBatches +Input [2]: [hash_partition_key#X, c_custkey#X] +Arguments: X + +(5) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] -(5) ShuffleQueryStage +(6) ShuffleQueryStage Output [1]: [c_custkey#X] Arguments: X -(6) InputAdapter +(7) InputAdapter Input [1]: [c_custkey#X] -(7) InputIteratorTransformer +(8) InputIteratorTransformer Input [1]: [c_custkey#X] -(8) Scan parquet +(9) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(9) FilterExecTransformer +(10) FilterExecTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Arguments: ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(10) ProjectExecTransformer +(11) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(12) ColumnarExchange +(13) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(14) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(13) ShuffleQueryStage +(15) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(14) InputAdapter +(16) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(15) InputIteratorTransformer +(17) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(16) ShuffledHashJoinExecTransformer +(18) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(17) ProjectExecTransformer +(19) ProjectExecTransformer Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(18) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [2]: [c_custkey#X, count(o_orderkey#X)#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [1]: [count(o_orderkey#X)#X AS c_count#X] Input [2]: 
[c_custkey#X, count(o_orderkey#X)#X] -(21) FlushableHashAggregateExecTransformer +(23) FlushableHashAggregateExecTransformer Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [3]: [hash(c_count#X, 42) AS hash_partition_key#X, c_count#X, count#X] Input [2]: [c_count#X, count#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_count#X, count#X] +Arguments: X + +(27) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [2]: [c_count#X, count#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [2]: [c_count#X, count#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input [2]: [c_count#X, count#X] -(28) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(30) ColumnarExchange +(33) VeloxAppendBatches +Input [2]: [c_count#X, custdist#X] +Arguments: X + +(34) ColumnarExchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(35) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] Arguments: X -(32) InputAdapter +(36) InputAdapter Input [2]: [c_count#X, custdist#X] -(33) InputIteratorTransformer +(37) InputIteratorTransformer Input [2]: [c_count#X, custdist#X] -(34) SortExecTransformer +(38) SortExecTransformer Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(36) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [2]: [c_count#X, custdist#X] -(37) Scan parquet +(41) Scan parquet Output [1]: [c_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Exchange +(42) Exchange Input [1]: [c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) Scan parquet +(43) Sort +Input [1]: [c_custkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(44) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(40) Filter +(45) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Condition : ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(41) Project +(46) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(42) Exchange +(47) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) ShuffledHashJoin +(48) Sort +Input [2]: 
[o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(49) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(44) Project +(50) Project Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(45) HashAggregate +(51) HashAggregate Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(46) HashAggregate +(52) HashAggregate Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [1]: [count(o_orderkey#X)#X AS c_count#X] -(47) HashAggregate +(53) HashAggregate Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(48) Exchange +(54) Exchange Input [2]: [c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) HashAggregate +(55) HashAggregate Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(50) Exchange +(56) Exchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Sort +(57) Sort Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(52) AdaptiveSparkPlan +(58) AdaptiveSparkPlan Output [2]: [c_count#X, custdist#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/14.txt index 3be22928e4f25..425c55f5a4ce2 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/14.txt @@ -1,38 +1,40 @@ == Physical Plan == -AdaptiveSparkPlan (35) +AdaptiveSparkPlan (39) +- == Final Plan == - VeloxColumnarToRowExec (23) - +- ^ ProjectExecTransformer (21) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (25) + +- ^ ProjectExecTransformer (23) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - HashAggregate (34) - +- HashAggregate (33) - +- 
Project (32) - +- ShuffledHashJoin Inner BuildRight (31) - :- Exchange (27) - : +- Project (26) - : +- Filter (25) - : +- Scan parquet (24) - +- Exchange (30) - +- Filter (29) - +- Scan parquet (28) + HashAggregate (38) + +- HashAggregate (37) + +- Project (36) + +- SortMergeJoin Inner (35) + :- Sort (30) + : +- Exchange (29) + : +- Project (28) + : +- Filter (27) + : +- Scan parquet (26) + +- Sort (34) + +- Exchange (33) + +- Filter (32) + +- Scan parquet (31) (1) Scan parquet @@ -54,144 +56,160 @@ Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] -(9) Scan parquet +(10) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [2]: [p_partkey#X, p_type#X] Arguments: isnotnull(p_partkey#X) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_type#X] Input [2]: [p_partkey#X, p_type#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [p_partkey#X, p_type#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [p_partkey#X, p_type#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [p_partkey#X, p_type#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [l_extendedprice#X, l_discount#X, p_type#X, CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END AS _pre_X#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS _pre_X#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(19) RegularHashAggregateExecTransformer +(21) 
RegularHashAggregateExecTransformer Input [5]: [l_extendedprice#X, l_discount#X, p_type#X, _pre_X#X, _pre_X#X] Keys: [] Functions [2]: [partial_sum(_pre_X#X), partial_sum(_pre_X#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(20) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(21) ProjectExecTransformer +(23) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X)), DecimalType(38,6), true)) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X as decimal(38,6)))), DecimalType(38,6), true) AS promo_revenue#X] Input [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(22) WholeStageCodegenTransformer (X) +(24) WholeStageCodegenTransformer (X) Input [1]: [promo_revenue#X] Arguments: false -(23) 
VeloxColumnarToRowExec +(25) VeloxColumnarToRowExec Input [1]: [promo_revenue#X] -(24) Scan parquet +(26) Scan parquet Output [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-09-01), LessThan(l_shipdate,1995-10-01), IsNotNull(l_partkey)] ReadSchema: struct -(25) Filter +(27) Filter Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-09-01)) AND (l_shipdate#X < 1995-10-01)) AND isnotnull(l_partkey#X)) -(26) Project +(28) Project Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(27) Exchange +(29) Exchange Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(28) Scan parquet +(30) Sort +Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(31) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(29) Filter +(32) Filter Input [2]: [p_partkey#X, p_type#X] Condition : isnotnull(p_partkey#X) -(30) Exchange +(33) Exchange Input [2]: [p_partkey#X, p_type#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(31) ShuffledHashJoin +(34) Sort +Input [2]: [p_partkey#X, p_type#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(35) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(32) Project +(36) Project Output [3]: [l_extendedprice#X, l_discount#X, p_type#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(33) HashAggregate +(37) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, p_type#X] Keys: [] Functions [2]: [partial_sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(34) HashAggregate +(38) HashAggregate Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), 
DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END)#X)), DecimalType(38,6), true)) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X as decimal(38,6)))), DecimalType(38,6), true) AS promo_revenue#X] -(35) AdaptiveSparkPlan +(39) AdaptiveSparkPlan Output [1]: [promo_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/15.txt index 82b075711eccc..f003eed994d08 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/15.txt @@ -1,47 +1,49 @@ == Physical Plan == -AdaptiveSparkPlan (45) +AdaptiveSparkPlan (50) +- == Final Plan == - VeloxColumnarToRowExec (30) - +- ^ SortExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ ShuffledHashJoinExecTransformer Inner (21) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (33) + +- ^ SortExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (23) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ FilterExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16) - +- ColumnarExchange (15) - +- ^ ProjectExecTransformer (13) - +- ^ FlushableHashAggregateExecTransformer (12) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ FilterExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ InputIteratorTransformer (20) + +- ShuffleQueryStage (18) + +- ColumnarExchange (17) + +- VeloxAppendBatches (16) + +- ^ ProjectExecTransformer (14) + +- ^ FlushableHashAggregateExecTransformer (13) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (44) - +- Exchange (43) - +- Project (42) - +- ShuffledHashJoin Inner BuildLeft (41) - :- Exchange (33) - : +- Filter (32) - : +- Scan parquet (31) - +- Filter (40) - +- HashAggregate (39) - +- Exchange (38) - +- HashAggregate (37) - +- Project 
(36) - +- Filter (35) - +- Scan parquet (34) + Sort (49) + +- Exchange (48) + +- Project (47) + +- SortMergeJoin Inner (46) + :- Sort (37) + : +- Exchange (36) + : +- Filter (35) + : +- Scan parquet (34) + +- Sort (45) + +- Filter (44) + +- HashAggregate (43) + +- Exchange (42) + +- HashAggregate (41) + +- Project (40) + +- Filter (39) + +- Scan parquet (38) (1) Scan parquet @@ -63,182 +65,202 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(12) FlushableHashAggregateExecTransformer +(13) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(13) ProjectExecTransformer +(14) ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(14) WholeStageCodegenTransformer (X) +(15) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(16) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(17) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(18) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(19) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(18) InputIteratorTransformer +(20) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, 
isEmpty#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS total_revenue#X] -(20) FilterExecTransformer +(22) FilterExecTransformer Input [2]: [supplier_no#X, total_revenue#X] Arguments: (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(21) ShuffledHashJoinExecTransformer +(23) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] +Arguments: X + +(27) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(28) SortExecTransformer +(31) SortExecTransformer Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(30) VeloxColumnarToRowExec +(33) VeloxColumnarToRowExec Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(31) Scan parquet +(34) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey)] ReadSchema: struct -(32) Filter +(35) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Condition : isnotnull(s_suppkey#X) -(33) Exchange +(36) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(34) Scan parquet +(37) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] +Arguments: [s_suppkey#X ASC NULLS 
FIRST], false, 0 + +(38) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(35) Filter +(39) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(36) Project +(40) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(37) HashAggregate +(41) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(38) Exchange +(42) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) HashAggregate +(43) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS total_revenue#X] -(40) Filter +(44) Filter Input [2]: [supplier_no#X, total_revenue#X] Condition : (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(41) ShuffledHashJoin +(45) Sort +Input [2]: [supplier_no#X, total_revenue#X] +Arguments: [supplier_no#X ASC NULLS FIRST], false, 0 + +(46) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(42) Project +(47) Project Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(43) Exchange +(48) Exchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(44) Sort +(49) Sort Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(45) AdaptiveSparkPlan +(50) AdaptiveSparkPlan Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/16.txt index 
4eff981d78bde..c9374b01ff021 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/16.txt @@ -1,64 +1,66 @@ == Physical Plan == -AdaptiveSparkPlan (64) +AdaptiveSparkPlan (71) +- == Final Plan == - VeloxColumnarToRowExec (42) - +- ^ SortExecTransformer (40) - +- ^ InputIteratorTransformer (39) - +- ^ InputAdapter (38) - +- ^ ShuffleQueryStage (37) - +- ColumnarExchange (36) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31) - +- ColumnarExchange (30) - +- ^ ProjectExecTransformer (28) - +- ^ FlushableHashAggregateExecTransformer (27) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (47) + +- ^ SortExecTransformer (45) + +- ^ InputIteratorTransformer (44) + +- ShuffleQueryStage (42) + +- ColumnarExchange (41) + +- VeloxAppendBatches (40) + +- ^ RegularHashAggregateExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ ProjectExecTransformer (31) + +- ^ FlushableHashAggregateExecTransformer (30) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (63) - +- Exchange (62) - +- HashAggregate (61) - +- Exchange (60) - +- HashAggregate (59) - +- HashAggregate (58) - +- Exchange (57) - +- HashAggregate (56) - +- Project (55) - +- ShuffledHashJoin Inner BuildRight (54) - :- Exchange (50) - : +- BroadcastHashJoin LeftAnti BuildRight (49) - : :- Filter (44) - : : +- Scan parquet (43) - : +- BroadcastExchange (48) - : +- Project (47) - : +- Filter (46) - : +- Scan parquet (45) - +- Exchange (53) - +- Filter (52) - +- Scan parquet (51) + Sort (70) + +- Exchange (69) + +- HashAggregate (68) + +- Exchange (67) + +- HashAggregate (66) + +- HashAggregate (65) + +- Exchange (64) + +- HashAggregate (63) + +- Project (62) + +- SortMergeJoin Inner (61) + :- Sort (56) + : +- Exchange (55) + : +- BroadcastHashJoin LeftAnti BuildRight (54) + : :- Filter (49) + : : +- Scan parquet (48) + : +- BroadcastExchange (53) + : +- Project (52) + : +- Filter (51) + : +- 
Scan parquet (50) + +- Sort (60) + +- Exchange (59) + +- Filter (58) + +- Scan parquet (57) (1) Scan parquet @@ -80,270 +82,298 @@ Input [2]: [ps_partkey#X, ps_suppkey#X] Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [ps_partkey#X, ps_suppkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [ps_partkey#X, ps_suppkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [ps_partkey#X, ps_suppkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(21) 
WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(27) FlushableHashAggregateExecTransformer +(30) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(28) ProjectExecTransformer +(31) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: false -(30) ColumnarExchange +(33) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] +Arguments: X + +(34) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(35) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: X -(32) InputAdapter +(36) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(33) InputIteratorTransformer +(37) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(34) RegularHashAggregateExecTransformer +(38) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] +Arguments: X + +(41) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], 
[id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(40) SortExecTransformer +(45) SortExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(41) WholeStageCodegenTransformer (X) +(46) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(42) VeloxColumnarToRowExec +(47) VeloxColumnarToRowExec Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(43) Scan parquet +(48) Scan parquet Output [2]: [ps_partkey#X, ps_suppkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_partkey)] ReadSchema: struct -(44) Filter +(49) Filter Input [2]: [ps_partkey#X, ps_suppkey#X] Condition : isnotnull(ps_partkey#X) -(45) Scan parquet +(50) Scan parquet Output [2]: [s_suppkey#X, s_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_comment)] ReadSchema: struct -(46) Filter +(51) Filter Input [2]: [s_suppkey#X, s_comment#X] Condition : (isnotnull(s_comment#X) AND s_comment#X LIKE %Customer%Complaints%) -(47) Project +(52) Project Output [1]: [s_suppkey#X] Input [2]: [s_suppkey#X, s_comment#X] -(48) BroadcastExchange +(53) BroadcastExchange Input [1]: [s_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),true), [plan_id=X] -(49) BroadcastHashJoin +(54) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(50) Exchange +(55) Exchange Input [2]: [ps_partkey#X, ps_suppkey#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Scan parquet +(56) Sort +Input [2]: [ps_partkey#X, ps_suppkey#X] +Arguments: [ps_partkey#X ASC NULLS FIRST], false, 0 + +(57) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(52) Filter +(58) Filter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Condition : (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(53) Exchange +(59) Exchange Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) ShuffledHashJoin +(60) Sort +Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(61) SortMergeJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(55) Project +(62) Project Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(56) HashAggregate +(63) HashAggregate Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, 
p_type#X, p_size#X, ps_suppkey#X] -(57) Exchange +(64) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(58) HashAggregate +(65) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(59) HashAggregate +(66) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(60) Exchange +(67) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(61) HashAggregate +(68) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(62) Exchange +(69) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(63) Sort +(70) Sort Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(64) AdaptiveSparkPlan +(71) AdaptiveSparkPlan Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/17.txt index 04f31f2d7a90c..69f50fa16ab02 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/17.txt @@ -1,59 +1,62 @@ == Physical Plan == -AdaptiveSparkPlan (57) +AdaptiveSparkPlan (63) +- == Final Plan == - VeloxColumnarToRowExec (37) - +- ^ ProjectExecTransformer (35) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ RegularHashAggregateExecTransformer (33) - +- ^ ProjectExecTransformer (32) - +- ^ ShuffledHashJoinExecTransformer Inner (31) - :- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (40) + +- ^ ProjectExecTransformer (38) + +- ^ RegularHashAggregateExecTransformer (37) + +- ^ RegularHashAggregateExecTransformer (36) + +- ^ ProjectExecTransformer (35) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (34) + :- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14) - : +- ColumnarExchange (13) - : +- 
^ ProjectExecTransformer (11) - : +- ^ FilterExecTransformer (10) - : +- ^ Scan parquet (9) - +- ^ FilterExecTransformer (30) - +- ^ ProjectExecTransformer (29) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ FilterExecTransformer (20) - +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FilterExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ FilterExecTransformer (33) + +- ^ ProjectExecTransformer (32) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ FlushableHashAggregateExecTransformer (23) + +- ^ FilterExecTransformer (22) + +- ^ Scan parquet (21) +- == Initial Plan == - HashAggregate (56) - +- HashAggregate (55) - +- Project (54) - +- ShuffledHashJoin Inner BuildRight (53) - :- Project (46) - : +- ShuffledHashJoin Inner BuildRight (45) - : :- Exchange (40) - : : +- Filter (39) - : : +- Scan parquet (38) - : +- Exchange (44) - : +- Project (43) - : +- Filter (42) - : +- Scan parquet (41) - +- Filter (52) - +- HashAggregate (51) - +- Exchange (50) - +- HashAggregate (49) - +- Filter (48) - +- Scan parquet (47) + HashAggregate (62) + +- HashAggregate (61) + +- Project (60) + +- SortMergeJoin Inner (59) + :- Project (51) + : +- SortMergeJoin Inner (50) + : :- Sort (44) + : : +- Exchange (43) + : : +- Filter (42) + : : +- Scan parquet (41) + : +- Sort (49) + : +- Exchange (48) + : +- Project (47) + : +- Filter (46) + : +- Scan parquet (45) + +- Sort (58) + +- Filter (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- Filter (53) + +- Scan parquet (52) (1) Scan parquet @@ -75,250 +78,274 @@ Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [p_partkey#X, p_brand#X, p_container#X] Arguments: ((((isnotnull(p_brand#X) AND isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [2]: 
[hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(15) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [1]: [p_partkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [1]: [p_partkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(19) Scan parquet +(21) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(20) FilterExecTransformer +(22) FilterExecTransformer Input [2]: [l_partkey#X, l_quantity#X] Arguments: isnotnull(l_partkey#X) -(21) FlushableHashAggregateExecTransformer +(23) FlushableHashAggregateExecTransformer Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [4]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_partkey#X, sum#X, count#X] Input [3]: [l_partkey#X, sum#X, count#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] +Arguments: X + +(27) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [3]: [l_partkey#X, sum#X, count#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [3]: [l_partkey#X, sum#X, count#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input [3]: [l_partkey#X, sum#X, count#X] -(28) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [l_partkey#X, avg(l_quantity#X)#X] -(29) ProjectExecTransformer +(32) ProjectExecTransformer Output [2]: [CheckOverflow((0.200000 * promote_precision(avg(l_quantity#X)#X)), DecimalType(18,7), true) AS (0.2 * avg(l_quantity))#X, l_partkey#X] Input [2]: [l_partkey#X, avg(l_quantity#X)#X] -(30) FilterExecTransformer +(33) FilterExecTransformer Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Arguments: isnotnull((0.2 * avg(l_quantity))#X) -(31) ShuffledHashJoinExecTransformer +(34) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 
* avg(l_quantity))#X) -(32) ProjectExecTransformer +(35) ProjectExecTransformer Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(33) RegularHashAggregateExecTransformer +(36) RegularHashAggregateExecTransformer Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(34) RegularHashAggregateExecTransformer +(37) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [sum(l_extendedprice#X)#X] -(35) ProjectExecTransformer +(38) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6), true) AS avg_yearly#X] Input [1]: [sum(l_extendedprice#X)#X] -(36) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [1]: [avg_yearly#X] Arguments: false -(37) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [1]: [avg_yearly#X] -(38) Scan parquet +(41) Scan parquet Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_quantity)] ReadSchema: struct -(39) Filter +(42) Filter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Condition : (isnotnull(l_partkey#X) AND isnotnull(l_quantity#X)) -(40) Exchange +(43) Exchange Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Scan parquet +(44) Sort +Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(45) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(42) Filter +(46) Filter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Condition : ((((isnotnull(p_brand#X) AND isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(43) Project +(47) Project Output [1]: [p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(44) Exchange +(48) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(45) ShuffledHashJoin +(49) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(50) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(46) Project +(51) Project Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(47) Scan parquet +(52) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(48) Filter +(53) Filter Input [2]: [l_partkey#X, l_quantity#X] Condition : isnotnull(l_partkey#X) -(49) HashAggregate +(54) HashAggregate Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(50) Exchange +(55) Exchange Input [3]: [l_partkey#X, sum#X, count#X] Arguments: 
hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) HashAggregate +(56) HashAggregate Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [CheckOverflow((0.200000 * promote_precision(avg(l_quantity#X)#X)), DecimalType(18,7), true) AS (0.2 * avg(l_quantity))#X, l_partkey#X] -(52) Filter +(57) Filter Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Condition : isnotnull((0.2 * avg(l_quantity))#X) -(53) ShuffledHashJoin +(58) Sort +Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(59) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(54) Project +(60) Project Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(55) HashAggregate +(61) HashAggregate Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(56) HashAggregate +(62) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6), true) AS avg_yearly#X] -(57) AdaptiveSparkPlan +(63) AdaptiveSparkPlan Output [1]: [avg_yearly#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/18.txt index ac59fbf6e10f6..96fa1cd826060 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/18.txt @@ -1,96 +1,102 @@ == Physical Plan == -AdaptiveSparkPlan (97) +AdaptiveSparkPlan (110) +- == Final Plan == - VeloxColumnarToRowExec (64) - +- TakeOrderedAndProjectExecTransformer (63) - +- ^ RegularHashAggregateExecTransformer (61) - +- ^ RegularHashAggregateExecTransformer (60) - +- ^ ProjectExecTransformer (59) - +- ^ ShuffledHashJoinExecTransformer Inner (58) - :- ^ InputIteratorTransformer (41) - : +- ^ InputAdapter (40) - : +- ^ ShuffleQueryStage (39) - : +- ColumnarExchange (38) - : +- ^ ProjectExecTransformer (36) - : +- ^ ShuffledHashJoinExecTransformer Inner (35) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (70) + +- TakeOrderedAndProjectExecTransformer (69) + +- ^ RegularHashAggregateExecTransformer (67) + +- ^ RegularHashAggregateExecTransformer (66) + +- ^ ProjectExecTransformer (65) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (64) + :- ^ InputIteratorTransformer (46) + : +- ShuffleQueryStage (44) + : +- ColumnarExchange (43) + : +- VeloxAppendBatches (42) + : +- ^ ProjectExecTransformer (40) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (39) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (34) - : +- ^ InputAdapter (33) - : +- ^ ShuffleQueryStage (32) - : +- ColumnarExchange (31) - : +- ^ 
ProjectExecTransformer (29) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (28) - : :- ^ InputIteratorTransformer (16) - : : +- ^ InputAdapter (15) - : : +- ^ ShuffleQueryStage (14) - : : +- ColumnarExchange (13) - : : +- ^ ProjectExecTransformer (11) - : : +- ^ FilterExecTransformer (10) - : : +- ^ Scan parquet (9) - : +- ^ ProjectExecTransformer (27) - : +- ^ FilterExecTransformer (26) - : +- ^ RegularHashAggregateExecTransformer (25) - : +- ^ InputIteratorTransformer (24) - : +- ^ InputAdapter (23) - : +- ^ ShuffleQueryStage (22) - : +- ColumnarExchange (21) - : +- ^ ProjectExecTransformer (19) - : +- ^ FlushableHashAggregateExecTransformer (18) - : +- ^ Scan parquet (17) - +- ^ ShuffledHashJoinExecTransformer LeftSemi (57) - :- ^ InputIteratorTransformer (49) - : +- ^ InputAdapter (48) - : +- ^ ShuffleQueryStage (47) - : +- ColumnarExchange (46) - : +- ^ ProjectExecTransformer (44) - : +- ^ FilterExecTransformer (43) - : +- ^ Scan parquet (42) - +- ^ ProjectExecTransformer (56) - +- ^ FilterExecTransformer (55) - +- ^ RegularHashAggregateExecTransformer (54) - +- ^ InputIteratorTransformer (53) - +- ^ InputAdapter (52) - +- ^ ShuffleQueryStage (51) - +- ReusedExchange (50) + : +- ^ InputIteratorTransformer (38) + : +- ShuffleQueryStage (36) + : +- ColumnarExchange (35) + : +- VeloxAppendBatches (34) + : +- ^ ProjectExecTransformer (32) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (31) + : :- ^ InputIteratorTransformer (18) + : : +- ShuffleQueryStage (16) + : : +- ColumnarExchange (15) + : : +- VeloxAppendBatches (14) + : : +- ^ ProjectExecTransformer (12) + : : +- ^ FilterExecTransformer (11) + : : +- ^ Scan parquet (10) + : +- ^ ProjectExecTransformer (30) + : +- ^ FilterExecTransformer (29) + : +- ^ RegularHashAggregateExecTransformer (28) + : +- ^ InputIteratorTransformer (27) + : +- ShuffleQueryStage (25) + : +- ColumnarExchange (24) + : +- VeloxAppendBatches (23) + : +- ^ ProjectExecTransformer (21) + : +- ^ FlushableHashAggregateExecTransformer (20) + : +- ^ Scan parquet (19) + +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (63) + :- ^ InputIteratorTransformer (55) + : +- ShuffleQueryStage (53) + : +- ColumnarExchange (52) + : +- VeloxAppendBatches (51) + : +- ^ ProjectExecTransformer (49) + : +- ^ FilterExecTransformer (48) + : +- ^ Scan parquet (47) + +- ^ ProjectExecTransformer (62) + +- ^ FilterExecTransformer (61) + +- ^ RegularHashAggregateExecTransformer (60) + +- ^ InputIteratorTransformer (59) + +- ShuffleQueryStage (57) + +- ReusedExchange (56) +- == Initial Plan == - TakeOrderedAndProject (96) - +- HashAggregate (95) - +- HashAggregate (94) - +- Project (93) - +- ShuffledHashJoin Inner BuildRight (92) - :- Exchange (81) - : +- Project (80) - : +- ShuffledHashJoin Inner BuildLeft (79) - : :- Exchange (67) - : : +- Filter (66) - : : +- Scan parquet (65) - : +- Exchange (78) - : +- ShuffledHashJoin LeftSemi BuildRight (77) - : :- Exchange (70) - : : +- Filter (69) - : : +- Scan parquet (68) - : +- Project (76) - : +- Filter (75) - : +- HashAggregate (74) - : +- Exchange (73) - : +- HashAggregate (72) - : +- Scan parquet (71) - +- ShuffledHashJoin LeftSemi BuildRight (91) - :- Exchange (84) - : +- Filter (83) - : +- Scan parquet (82) - +- Project (90) - +- Filter (89) - +- HashAggregate (88) - +- Exchange (87) - +- HashAggregate (86) - +- Scan parquet (85) + TakeOrderedAndProject (109) + +- HashAggregate (108) + +- HashAggregate (107) + +- Project (106) + +- SortMergeJoin Inner (105) + :- Sort (92) + : +- Exchange (91) + : +- Project (90) + : 
+- SortMergeJoin Inner (89) + : :- Sort (74) + : : +- Exchange (73) + : : +- Filter (72) + : : +- Scan parquet (71) + : +- Sort (88) + : +- Exchange (87) + : +- SortMergeJoin LeftSemi (86) + : :- Sort (78) + : : +- Exchange (77) + : : +- Filter (76) + : : +- Scan parquet (75) + : +- Sort (85) + : +- Project (84) + : +- Filter (83) + : +- HashAggregate (82) + : +- Exchange (81) + : +- HashAggregate (80) + : +- Scan parquet (79) + +- SortMergeJoin LeftSemi (104) + :- Sort (96) + : +- Exchange (95) + : +- Filter (94) + : +- Scan parquet (93) + +- Sort (103) + +- Project (102) + +- Filter (101) + +- HashAggregate (100) + +- Exchange (99) + +- HashAggregate (98) + +- Scan parquet (97) (1) Scan parquet @@ -112,420 +118,472 @@ Input [2]: [c_custkey#X, c_name#X] Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [c_custkey#X, c_name#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [c_custkey#X, c_name#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [c_custkey#X, c_name#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(18) FlushableHashAggregateExecTransformer +(20) FlushableHashAggregateExecTransformer Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer 
Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] +Arguments: X + +(24) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(25) RegularHashAggregateExecTransformer +(28) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(26) FilterExecTransformer +(29) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(27) ProjectExecTransformer +(30) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(28) ShuffledHashJoinExecTransformer +(31) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(29) ProjectExecTransformer +(32) ProjectExecTransformer Output [5]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(30) WholeStageCodegenTransformer (X) +(33) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(31) ColumnarExchange +(34) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(35) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(32) ShuffleQueryStage +(36) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(33) InputAdapter +(37) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(34) InputIteratorTransformer +(38) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(35) ShuffledHashJoinExecTransformer +(39) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(36) ProjectExecTransformer +(40) ProjectExecTransformer Output [6]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(37) WholeStageCodegenTransformer (X) +(41) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] 
Arguments: false -(38) ColumnarExchange +(42) VeloxAppendBatches +Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(43) ColumnarExchange Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(39) ShuffleQueryStage +(44) ShuffleQueryStage Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(40) InputAdapter +(45) InputAdapter Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -(41) InputIteratorTransformer +(46) InputIteratorTransformer Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -(42) Scan parquet +(47) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(43) FilterExecTransformer +(48) FilterExecTransformer Input [2]: [l_orderkey#X, l_quantity#X] Arguments: isnotnull(l_orderkey#X) -(44) ProjectExecTransformer +(49) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_quantity#X] Input [2]: [l_orderkey#X, l_quantity#X] -(45) WholeStageCodegenTransformer (X) +(50) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] Arguments: false -(46) ColumnarExchange +(51) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] +Arguments: X + +(52) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [id=#X] -(47) ShuffleQueryStage +(53) ShuffleQueryStage Output [2]: [l_orderkey#X, l_quantity#X] Arguments: X -(48) InputAdapter +(54) InputAdapter Input [2]: [l_orderkey#X, l_quantity#X] -(49) InputIteratorTransformer +(55) InputIteratorTransformer Input [2]: [l_orderkey#X, l_quantity#X] -(50) ReusedExchange [Reuses operator id: 21] +(56) ReusedExchange [Reuses operator id: 24] Output [3]: [l_orderkey#X, sum#X, isEmpty#X] -(51) ShuffleQueryStage +(57) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(52) InputAdapter +(58) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(53) InputIteratorTransformer +(59) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(54) RegularHashAggregateExecTransformer +(60) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(55) FilterExecTransformer +(61) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(56) ProjectExecTransformer +(62) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(57) ShuffledHashJoinExecTransformer +(63) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(58) ShuffledHashJoinExecTransformer +(64) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(59) ProjectExecTransformer +(65) 
ProjectExecTransformer Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(60) RegularHashAggregateExecTransformer +(66) RegularHashAggregateExecTransformer Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(61) RegularHashAggregateExecTransformer +(67) RegularHashAggregateExecTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(62) WholeStageCodegenTransformer (X) +(68) WholeStageCodegenTransformer (X) Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: false -(63) TakeOrderedAndProjectExecTransformer +(69) TakeOrderedAndProjectExecTransformer Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X], 0 -(64) VeloxColumnarToRowExec +(70) VeloxColumnarToRowExec Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(65) Scan parquet +(71) Scan parquet Output [2]: [c_custkey#X, c_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey)] ReadSchema: struct -(66) Filter +(72) Filter Input [2]: [c_custkey#X, c_name#X] Condition : isnotnull(c_custkey#X) -(67) Exchange +(73) Exchange Input [2]: [c_custkey#X, c_name#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) Scan parquet +(74) Sort +Input [2]: [c_custkey#X, c_name#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(75) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(69) Filter +(76) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Condition : (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(70) Exchange +(77) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(78) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(79) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(72) HashAggregate +(80) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(73) Exchange +(81) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: 
hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) HashAggregate +(82) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(75) Filter +(83) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(76) Project +(84) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(77) ShuffledHashJoin +(85) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(86) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(78) Exchange +(87) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(79) ShuffledHashJoin +(88) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(89) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(80) Project +(90) Project Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(81) Exchange +(91) Exchange Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(82) Scan parquet +(92) Sort +Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(93) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(83) Filter +(94) Filter Input [2]: [l_orderkey#X, l_quantity#X] Condition : isnotnull(l_orderkey#X) -(84) Exchange +(95) Exchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) Scan parquet +(96) Sort +Input [2]: [l_orderkey#X, l_quantity#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(97) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(86) HashAggregate +(98) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(87) Exchange +(99) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(88) HashAggregate +(100) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(89) Filter +(101) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(90) Project +(102) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(91) ShuffledHashJoin +(103) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(104) SortMergeJoin Left keys [1]: [l_orderkey#X] Right 
keys [1]: [l_orderkey#X] Join condition: None -(92) ShuffledHashJoin +(105) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(93) Project +(106) Project Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(94) HashAggregate +(107) HashAggregate Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(95) HashAggregate +(108) HashAggregate Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(96) TakeOrderedAndProject +(109) TakeOrderedAndProject Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(97) AdaptiveSparkPlan +(110) AdaptiveSparkPlan Output [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/19.txt index 4bff7da9b4735..ae0feb5dfd565 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/19.txt @@ -1,37 +1,39 @@ == Physical Plan == -AdaptiveSparkPlan (34) +AdaptiveSparkPlan (38) +- == Final Plan == - VeloxColumnarToRowExec (22) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (24) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - HashAggregate (33) - +- HashAggregate (32) - +- Project (31) - +- ShuffledHashJoin Inner BuildRight (30) - :- 
Exchange (26) - : +- Project (25) - : +- Filter (24) - : +- Scan parquet (23) - +- Exchange (29) - +- Filter (28) - +- Scan parquet (27) + HashAggregate (37) + +- HashAggregate (36) + +- Project (35) + +- SortMergeJoin Inner (34) + :- Sort (29) + : +- Exchange (28) + : +- Project (27) + : +- Filter (26) + : +- Scan parquet (25) + +- Sort (33) + +- Exchange (32) + +- Filter (31) + +- Scan parquet (30) (1) Scan parquet @@ -53,140 +55,156 @@ Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipin Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [p_partkey#X, p_brand#X, p_size#X, 
p_container#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS _pre_X#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_extendedprice#X, l_discount#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(20) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(22) VeloxColumnarToRowExec +(24) VeloxColumnarToRowExec Input [1]: [revenue#X] -(23) Scan parquet +(25) Scan parquet Output [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipinstruct), In(l_shipmode, [AIR,AIR REG]), EqualTo(l_shipinstruct,DELIVER IN PERSON), IsNotNull(l_partkey), Or(Or(And(GreaterThanOrEqual(l_quantity,1.00),LessThanOrEqual(l_quantity,11.00)),And(GreaterThanOrEqual(l_quantity,10.00),LessThanOrEqual(l_quantity,20.00))),And(GreaterThanOrEqual(l_quantity,20.00),LessThanOrEqual(l_quantity,30.00)))] ReadSchema: struct -(24) Filter +(26) Filter Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Condition : ((((isnotnull(l_shipinstruct#X) AND l_shipmode#X IN (AIR,AIR REG)) AND (l_shipinstruct#X = DELIVER IN PERSON)) AND isnotnull(l_partkey#X)) AND ((((l_quantity#X >= 1.00) AND (l_quantity#X <= 11.00)) OR ((l_quantity#X >= 10.00) 
AND (l_quantity#X <= 20.00))) OR ((l_quantity#X >= 20.00) AND (l_quantity#X <= 30.00)))) -(25) Project +(27) Project Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] -(26) Exchange +(28) Exchange Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Scan parquet +(29) Sort +Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(30) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(28) Filter +(31) Filter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Condition : (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(29) Exchange +(32) Exchange Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(30) ShuffledHashJoin +(33) Sort +Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(34) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(31) Project +(35) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(32) HashAggregate +(36) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(33) HashAggregate +(37) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] 
Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(34) AdaptiveSparkPlan +(38) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/20.txt index 15ea12dd248eb..bab7855516360 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/20.txt @@ -1,121 +1,130 @@ == Physical Plan == -AdaptiveSparkPlan (126) +AdaptiveSparkPlan (146) +- == Final Plan == - VeloxColumnarToRowExec (86) - +- ^ SortExecTransformer (84) - +- ^ InputIteratorTransformer (83) - +- ^ InputAdapter (82) - +- ^ ShuffleQueryStage (81) - +- ColumnarExchange (80) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (62) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (96) + +- ^ SortExecTransformer (94) + +- ^ InputIteratorTransformer (93) + +- ShuffleQueryStage (91) + +- ColumnarExchange (90) + +- VeloxAppendBatches (89) + +- ^ ProjectExecTransformer (87) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (86) + :- ^ InputIteratorTransformer (76) + : +- ShuffleQueryStage (74) + : +- ColumnarExchange (73) + : +- VeloxAppendBatches (72) + : +- ^ ProjectExecTransformer (70) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (69) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ ShuffledHashJoinExecTransformer Inner (55) - : :- ^ InputIteratorTransformer (31) - : : +- ^ InputAdapter (30) - : : +- ^ ShuffleQueryStage (29) - : : +- ColumnarExchange (28) - : : +- ^ ProjectExecTransformer (26) - : : +- ^ ShuffledHashJoinExecTransformer LeftSemi (25) - : : :- ^ InputIteratorTransformer (16) - : : : +- ^ InputAdapter (15) - : : : +- ^ ShuffleQueryStage (14) - : : : +- ColumnarExchange (13) - : : : +- ^ ProjectExecTransformer (11) - : : : +- ^ FilterExecTransformer (10) - : : : +- ^ Scan parquet (9) - : : +- ^ InputIteratorTransformer (24) - : : +- ^ InputAdapter (23) - : : +- ^ ShuffleQueryStage (22) - : : +- ColumnarExchange (21) - : : +- ^ ProjectExecTransformer (19) - : : +- ^ FilterExecTransformer (18) - : : +- ^ Scan parquet (17) - : +- ^ InputIteratorTransformer (54) - : +- ^ InputAdapter (53) - : +- ^ ShuffleQueryStage (52) - : +- ColumnarExchange (51) - : +- ^ 
ProjectExecTransformer (49) - : +- ^ FilterExecTransformer (48) - : +- ^ ProjectExecTransformer (47) - : +- ^ RegularHashAggregateExecTransformer (46) - : +- ^ RegularHashAggregateExecTransformer (45) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (44) - : :- ^ InputIteratorTransformer (39) - : : +- ^ InputAdapter (38) - : : +- ^ ShuffleQueryStage (37) - : : +- ColumnarExchange (36) - : : +- ^ ProjectExecTransformer (34) - : : +- ^ FilterExecTransformer (33) - : : +- ^ Scan parquet (32) - : +- ^ InputIteratorTransformer (43) - : +- ^ InputAdapter (42) - : +- ^ ShuffleQueryStage (41) - : +- ReusedExchange (40) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ FilterExecTransformer (70) - +- ^ Scan parquet (69) + : +- ^ InputIteratorTransformer (68) + : +- ShuffleQueryStage (66) + : +- ColumnarExchange (65) + : +- VeloxAppendBatches (64) + : +- ^ ProjectExecTransformer (62) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (61) + : :- ^ InputIteratorTransformer (35) + : : +- ShuffleQueryStage (33) + : : +- ColumnarExchange (32) + : : +- VeloxAppendBatches (31) + : : +- ^ ProjectExecTransformer (29) + : : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (28) + : : :- ^ InputIteratorTransformer (18) + : : : +- ShuffleQueryStage (16) + : : : +- ColumnarExchange (15) + : : : +- VeloxAppendBatches (14) + : : : +- ^ ProjectExecTransformer (12) + : : : +- ^ FilterExecTransformer (11) + : : : +- ^ Scan parquet (10) + : : +- ^ InputIteratorTransformer (27) + : : +- ShuffleQueryStage (25) + : : +- ColumnarExchange (24) + : : +- VeloxAppendBatches (23) + : : +- ^ ProjectExecTransformer (21) + : : +- ^ FilterExecTransformer (20) + : : +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (60) + : +- ShuffleQueryStage (58) + : +- ColumnarExchange (57) + : +- VeloxAppendBatches (56) + : +- ^ ProjectExecTransformer (54) + : +- ^ FilterExecTransformer (53) + : +- ^ ProjectExecTransformer (52) + : +- ^ RegularHashAggregateExecTransformer (51) + : +- ^ RegularHashAggregateExecTransformer (50) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (49) + : :- ^ InputIteratorTransformer (44) + : : +- ShuffleQueryStage (42) + : : +- ColumnarExchange (41) + : : +- VeloxAppendBatches (40) + : : +- ^ ProjectExecTransformer (38) + : : +- ^ FilterExecTransformer (37) + : : +- ^ Scan parquet (36) + : +- ^ InputIteratorTransformer (48) + : +- ShuffleQueryStage (46) + : +- ReusedExchange (45) + +- ^ InputIteratorTransformer (85) + +- ShuffleQueryStage (83) + +- ColumnarExchange (82) + +- VeloxAppendBatches (81) + +- ^ ProjectExecTransformer (79) + +- ^ FilterExecTransformer (78) + +- ^ Scan parquet (77) +- == Initial Plan == - Sort (125) - +- Exchange (124) - +- Project (123) - +- ShuffledHashJoin Inner BuildRight (122) - :- Exchange (117) - : +- Project (116) - : +- ShuffledHashJoin LeftSemi BuildRight (115) - : :- Exchange (89) - : : +- Filter (88) - : : +- Scan parquet (87) - : +- Exchange (114) - : +- Project (113) - : +- ShuffledHashJoin Inner BuildLeft (112) - : :- Exchange (98) - : : +- ShuffledHashJoin LeftSemi BuildRight (97) - : : :- Exchange (92) - : : : +- Filter (91) - : : : +- Scan parquet (90) - : : +- Exchange (96) - : : +- Project (95) - : : +- Filter (94) - : : +- Scan parquet (93) - : +- Exchange (111) - : +- Filter (110) - : +- HashAggregate (109) - : +- HashAggregate (108) - : +- ShuffledHashJoin LeftSemi BuildRight (107) - : :- Exchange (102) - : : +- Project 
(101) - : : +- Filter (100) - : : +- Scan parquet (99) - : +- Exchange (106) - : +- Project (105) - : +- Filter (104) - : +- Scan parquet (103) - +- Exchange (121) - +- Project (120) - +- Filter (119) - +- Scan parquet (118) + Sort (145) + +- Exchange (144) + +- Project (143) + +- SortMergeJoin Inner (142) + :- Sort (136) + : +- Exchange (135) + : +- Project (134) + : +- SortMergeJoin LeftSemi (133) + : :- Sort (100) + : : +- Exchange (99) + : : +- Filter (98) + : : +- Scan parquet (97) + : +- Sort (132) + : +- Exchange (131) + : +- Project (130) + : +- SortMergeJoin Inner (129) + : :- Sort (112) + : : +- Exchange (111) + : : +- SortMergeJoin LeftSemi (110) + : : :- Sort (104) + : : : +- Exchange (103) + : : : +- Filter (102) + : : : +- Scan parquet (101) + : : +- Sort (109) + : : +- Exchange (108) + : : +- Project (107) + : : +- Filter (106) + : : +- Scan parquet (105) + : +- Sort (128) + : +- Exchange (127) + : +- Filter (126) + : +- HashAggregate (125) + : +- HashAggregate (124) + : +- SortMergeJoin LeftSemi (123) + : :- Sort (117) + : : +- Exchange (116) + : : +- Project (115) + : : +- Filter (114) + : : +- Scan parquet (113) + : +- Sort (122) + : +- Exchange (121) + : +- Project (120) + : +- Filter (119) + : +- Scan parquet (118) + +- Sort (141) + +- Exchange (140) + +- Project (139) + +- Filter (138) + +- Scan parquet (137) (1) Scan parquet @@ -137,518 +146,598 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) 
ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(18) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [p_partkey#X, p_name#X] Arguments: (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [2]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(24) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [1]: [p_partkey#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [1]: [p_partkey#X] -(25) ShuffledHashJoinExecTransformer +(28) ShuffledHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, ps_suppkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(32) Scan parquet +(36) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(33) FilterExecTransformer +(37) FilterExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Arguments: ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(34) ProjectExecTransformer +(38) ProjectExecTransformer Output [4]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: 
[l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] +Arguments: X + +(41) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] -(40) ReusedExchange [Reuses operator id: 21] +(45) ReusedExchange [Reuses operator id: 24] Output [1]: [p_partkey#X] -(41) ShuffleQueryStage +(46) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(42) InputAdapter +(47) InputAdapter Input [1]: [p_partkey#X] -(43) InputIteratorTransformer +(48) InputIteratorTransformer Input [1]: [p_partkey#X] -(44) ShuffledHashJoinExecTransformer +(49) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(45) RegularHashAggregateExecTransformer +(50) RegularHashAggregateExecTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(46) RegularHashAggregateExecTransformer +(51) RegularHashAggregateExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(47) ProjectExecTransformer +(52) ProjectExecTransformer Output [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3), true) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(48) FilterExecTransformer +(53) FilterExecTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: isnotnull((0.5 * sum(l_quantity))#X) -(49) ProjectExecTransformer +(54) ProjectExecTransformer Output [4]: [hash(l_partkey#X, l_suppkey#X, 42) AS hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(50) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: false -(51) ColumnarExchange +(56) VeloxAppendBatches +Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] +Arguments: X + +(57) ColumnarExchange Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(52) ShuffleQueryStage +(58) ShuffleQueryStage Output [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: X -(53) InputAdapter +(59) InputAdapter Input [3]: [(0.5 * sum(l_quantity))#X, 
l_partkey#X, l_suppkey#X] -(54) InputIteratorTransformer +(60) InputIteratorTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(55) ShuffledHashJoinExecTransformer +(61) ShuffledHashJoinExecTransformer Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(cast(ps_availqty#X as decimal(10,0)) as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(56) ProjectExecTransformer +(62) ProjectExecTransformer Output [2]: [hash(ps_suppkey#X, 42) AS hash_partition_key#X, ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(57) WholeStageCodegenTransformer (X) +(63) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, ps_suppkey#X] Arguments: false -(58) ColumnarExchange +(64) VeloxAppendBatches +Input [2]: [hash_partition_key#X, ps_suppkey#X] +Arguments: X + +(65) ColumnarExchange Input [2]: [hash_partition_key#X, ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(66) ShuffleQueryStage Output [1]: [ps_suppkey#X] Arguments: X -(60) InputAdapter +(67) InputAdapter Input [1]: [ps_suppkey#X] -(61) InputIteratorTransformer +(68) InputIteratorTransformer Input [1]: [ps_suppkey#X] -(62) ShuffledHashJoinExecTransformer +(69) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(63) ProjectExecTransformer +(70) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(64) WholeStageCodegenTransformer (X) +(71) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(65) ColumnarExchange +(72) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(73) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(74) ShuffleQueryStage Output [3]: [s_name#X, s_address#X, s_nationkey#X] Arguments: X -(67) InputAdapter +(75) InputAdapter Input [3]: [s_name#X, s_address#X, s_nationkey#X] -(68) InputIteratorTransformer +(76) InputIteratorTransformer Input [3]: [s_name#X, s_address#X, s_nationkey#X] -(69) Scan parquet +(77) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(70) FilterExecTransformer +(78) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(71) ProjectExecTransformer +(79) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(72) WholeStageCodegenTransformer (X) +(80) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(73) ColumnarExchange +(81) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(82) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], 
[plan_id=X], [id=#X] -(74) ShuffleQueryStage +(83) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(75) InputAdapter +(84) InputAdapter Input [1]: [n_nationkey#X] -(76) InputIteratorTransformer +(85) InputIteratorTransformer Input [1]: [n_nationkey#X] -(77) ShuffledHashJoinExecTransformer +(86) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(78) ProjectExecTransformer +(87) ProjectExecTransformer Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(79) WholeStageCodegenTransformer (X) +(88) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(80) ColumnarExchange +(89) VeloxAppendBatches +Input [2]: [s_name#X, s_address#X] +Arguments: X + +(90) ColumnarExchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(81) ShuffleQueryStage +(91) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] Arguments: X -(82) InputAdapter +(92) InputAdapter Input [2]: [s_name#X, s_address#X] -(83) InputIteratorTransformer +(93) InputIteratorTransformer Input [2]: [s_name#X, s_address#X] -(84) SortExecTransformer +(94) SortExecTransformer Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(85) WholeStageCodegenTransformer (X) +(95) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(86) VeloxColumnarToRowExec +(96) VeloxColumnarToRowExec Input [2]: [s_name#X, s_address#X] -(87) Scan parquet +(97) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_nationkey)] ReadSchema: struct -(88) Filter +(98) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Condition : isnotnull(s_nationkey#X) -(89) Exchange +(99) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) Scan parquet +(100) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(101) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(91) Filter +(102) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Condition : ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(92) Exchange +(103) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(93) Scan parquet +(104) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: [ps_partkey#X ASC NULLS FIRST], false, 0 + +(105) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(94) Filter +(106) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(95) Project +(107) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(96) Exchange +(108) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) ShuffledHashJoin +(109) Sort +Input [1]: 
[p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(110) SortMergeJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(98) Exchange +(111) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Scan parquet +(112) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: [ps_partkey#X ASC NULLS FIRST, ps_suppkey#X ASC NULLS FIRST], false, 0 + +(113) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(100) Filter +(114) Filter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(101) Project +(115) Project Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(102) Exchange +(116) Exchange Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(103) Scan parquet +(117) Sort +Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(118) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(104) Filter +(119) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(105) Project +(120) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(106) Exchange +(121) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(107) ShuffledHashJoin +(122) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(123) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(108) HashAggregate +(124) HashAggregate Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(109) HashAggregate +(125) HashAggregate Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3), true) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(110) Filter +(126) Filter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Condition : isnotnull((0.5 * sum(l_quantity))#X) -(111) Exchange +(127) Exchange Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) ShuffledHashJoin +(128) Sort +Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] +Arguments: [l_partkey#X ASC NULLS FIRST, l_suppkey#X ASC NULLS FIRST], false, 0 + +(129) SortMergeJoin Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: 
[l_partkey#X, l_suppkey#X] Join condition: (cast(cast(ps_availqty#X as decimal(10,0)) as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(113) Project +(130) Project Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(114) Exchange +(131) Exchange Input [1]: [ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(115) ShuffledHashJoin +(132) Sort +Input [1]: [ps_suppkey#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(133) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(116) Project +(134) Project Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(117) Exchange +(135) Exchange Input [3]: [s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(118) Scan parquet +(136) Sort +Input [3]: [s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(137) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(119) Filter +(138) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(120) Project +(139) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(121) Exchange +(140) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(122) ShuffledHashJoin +(141) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(142) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(123) Project +(143) Project Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(124) Exchange +(144) Exchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(125) Sort +(145) Sort Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(126) AdaptiveSparkPlan +(146) AdaptiveSparkPlan Output [2]: [s_name#X, s_address#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/21.txt index 17fad29860dfd..ef4e87bb1de4d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/21.txt @@ -1,114 +1,123 @@ == Physical Plan == -AdaptiveSparkPlan (119) +AdaptiveSparkPlan (138) +- == Final Plan == - VeloxColumnarToRowExec (82) - +- TakeOrderedAndProjectExecTransformer (81) - +- ^ RegularHashAggregateExecTransformer (79) - +- ^ InputIteratorTransformer (78) - +- ^ InputAdapter (77) - +- ^ ShuffleQueryStage (76) - +- ColumnarExchange (75) - +- ^ ProjectExecTransformer (73) - +- ^ FlushableHashAggregateExecTransformer (72) - +- ^ ProjectExecTransformer (71) - +- ^ ShuffledHashJoinExecTransformer Inner (70) - :- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ ShuffledHashJoinExecTransformer Inner (55) - : :- 
^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ ShuffledHashJoinExecTransformer Inner (40) - : : :- ^ InputIteratorTransformer (8) - : : : +- ^ InputAdapter (7) - : : : +- ^ ShuffleQueryStage (6) - : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (92) + +- TakeOrderedAndProjectExecTransformer (91) + +- ^ RegularHashAggregateExecTransformer (89) + +- ^ InputIteratorTransformer (88) + +- ShuffleQueryStage (86) + +- ColumnarExchange (85) + +- VeloxAppendBatches (84) + +- ^ ProjectExecTransformer (82) + +- ^ FlushableHashAggregateExecTransformer (81) + +- ^ ProjectExecTransformer (80) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (79) + :- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (62) + : :- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (45) + : : :- ^ InputIteratorTransformer (9) + : : : +- ShuffleQueryStage (7) + : : : +- ColumnarExchange (6) + : : : +- VeloxAppendBatches (5) : : : +- ^ ProjectExecTransformer (3) : : : +- ^ FilterExecTransformer (2) : : : +- ^ Scan parquet (1) - : : +- ^ InputIteratorTransformer (39) - : : +- ^ InputAdapter (38) - : : +- ^ ShuffleQueryStage (37) - : : +- ColumnarExchange (36) - : : +- ^ ProjectExecTransformer (34) - : : +- ^ ShuffledHashJoinExecTransformer LeftAnti (33) - : : :- ^ ShuffledHashJoinExecTransformer LeftSemi (24) - : : : :- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ FilterExecTransformer (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (23) - : : : +- ^ InputAdapter (22) - : : : +- ^ ShuffleQueryStage (21) - : : : +- ColumnarExchange (20) - : : : +- ^ ProjectExecTransformer (18) - : : : +- ^ Scan parquet (17) - : : +- ^ InputIteratorTransformer (32) - : : +- ^ InputAdapter (31) - : : +- ^ ShuffleQueryStage (30) - : : +- ColumnarExchange (29) - : : +- ^ ProjectExecTransformer (27) - : : +- ^ FilterExecTransformer (26) - : : +- ^ Scan parquet (25) - : +- ^ InputIteratorTransformer (54) - : +- ^ InputAdapter (53) - : +- ^ ShuffleQueryStage (52) - : +- ColumnarExchange (51) - : +- ^ ProjectExecTransformer (49) - : +- ^ FilterExecTransformer (48) - : +- ^ Scan parquet (47) - +- ^ InputIteratorTransformer (69) - +- ^ InputAdapter (68) - +- ^ ShuffleQueryStage (67) - +- ColumnarExchange (66) - +- ^ ProjectExecTransformer (64) - +- ^ FilterExecTransformer (63) - +- ^ Scan parquet (62) + : : +- ^ InputIteratorTransformer (44) + : : +- ShuffleQueryStage (42) + : : +- ColumnarExchange (41) + : : +- VeloxAppendBatches (40) + : : +- ^ ProjectExecTransformer (38) + : : +- ^ ShuffledHashJoinExecTransformer LeftAnti BuildRight (37) + : : :- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (27) + : : : :- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ FilterExecTransformer (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ 
InputIteratorTransformer (26) + : : : +- ShuffleQueryStage (24) + : : : +- ColumnarExchange (23) + : : : +- VeloxAppendBatches (22) + : : : +- ^ ProjectExecTransformer (20) + : : : +- ^ Scan parquet (19) + : : +- ^ InputIteratorTransformer (36) + : : +- ShuffleQueryStage (34) + : : +- ColumnarExchange (33) + : : +- VeloxAppendBatches (32) + : : +- ^ ProjectExecTransformer (30) + : : +- ^ FilterExecTransformer (29) + : : +- ^ Scan parquet (28) + : +- ^ InputIteratorTransformer (61) + : +- ShuffleQueryStage (59) + : +- ColumnarExchange (58) + : +- VeloxAppendBatches (57) + : +- ^ ProjectExecTransformer (55) + : +- ^ FilterExecTransformer (54) + : +- ^ Scan parquet (53) + +- ^ InputIteratorTransformer (78) + +- ShuffleQueryStage (76) + +- ColumnarExchange (75) + +- VeloxAppendBatches (74) + +- ^ ProjectExecTransformer (72) + +- ^ FilterExecTransformer (71) + +- ^ Scan parquet (70) +- == Initial Plan == - TakeOrderedAndProject (118) - +- HashAggregate (117) - +- Exchange (116) - +- HashAggregate (115) - +- Project (114) - +- ShuffledHashJoin Inner BuildRight (113) - :- Exchange (108) - : +- Project (107) - : +- ShuffledHashJoin Inner BuildRight (106) - : :- Exchange (101) - : : +- Project (100) - : : +- ShuffledHashJoin Inner BuildLeft (99) - : : :- Exchange (85) - : : : +- Filter (84) - : : : +- Scan parquet (83) - : : +- Exchange (98) - : : +- ShuffledHashJoin LeftAnti BuildRight (97) - : : :- ShuffledHashJoin LeftSemi BuildRight (92) - : : : :- Exchange (89) - : : : : +- Project (88) - : : : : +- Filter (87) - : : : : +- Scan parquet (86) - : : : +- Exchange (91) - : : : +- Scan parquet (90) - : : +- Exchange (96) - : : +- Project (95) - : : +- Filter (94) - : : +- Scan parquet (93) - : +- Exchange (105) - : +- Project (104) - : +- Filter (103) - : +- Scan parquet (102) - +- Exchange (112) - +- Project (111) - +- Filter (110) - +- Scan parquet (109) + TakeOrderedAndProject (137) + +- HashAggregate (136) + +- Exchange (135) + +- HashAggregate (134) + +- Project (133) + +- SortMergeJoin Inner (132) + :- Sort (126) + : +- Exchange (125) + : +- Project (124) + : +- SortMergeJoin Inner (123) + : :- Sort (117) + : : +- Exchange (116) + : : +- Project (115) + : : +- SortMergeJoin Inner (114) + : : :- Sort (96) + : : : +- Exchange (95) + : : : +- Filter (94) + : : : +- Scan parquet (93) + : : +- Sort (113) + : : +- Exchange (112) + : : +- SortMergeJoin LeftAnti (111) + : : :- SortMergeJoin LeftSemi (105) + : : : :- Sort (101) + : : : : +- Exchange (100) + : : : : +- Project (99) + : : : : +- Filter (98) + : : : : +- Scan parquet (97) + : : : +- Sort (104) + : : : +- Exchange (103) + : : : +- Scan parquet (102) + : : +- Sort (110) + : : +- Exchange (109) + : : +- Project (108) + : : +- Filter (107) + : : +- Scan parquet (106) + : +- Sort (122) + : +- Exchange (121) + : +- Project (120) + : +- Filter (119) + : +- Scan parquet (118) + +- Sort (131) + +- Exchange (130) + +- Project (129) + +- Filter (128) + +- Scan parquet (127) (1) Scan parquet @@ -130,494 +139,570 @@ Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage 
Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Arguments: ((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [2]: [l_orderkey#X, l_suppkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(23) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(24) ShuffledHashJoinExecTransformer +(27) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(25) Scan parquet +(28) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(26) FilterExecTransformer +(29) FilterExecTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Arguments: ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND 
(l_receiptdate#X > l_commitdate#X)) -(27) ProjectExecTransformer +(30) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(28) WholeStageCodegenTransformer (X) +(31) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(29) ColumnarExchange +(32) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(33) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(30) ShuffleQueryStage +(34) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(31) InputAdapter +(35) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(32) InputIteratorTransformer +(36) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(33) ShuffledHashJoinExecTransformer +(37) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(34) ProjectExecTransformer +(38) ProjectExecTransformer Output [3]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [2]: [l_orderkey#X, l_suppkey#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(41) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(40) ShuffledHashJoinExecTransformer +(45) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] -(47) Scan parquet +(53) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), 
EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(48) FilterExecTransformer +(54) FilterExecTransformer Input [2]: [o_orderkey#X, o_orderstatus#X] Arguments: ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(49) ProjectExecTransformer +(55) ProjectExecTransformer Output [2]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(50) WholeStageCodegenTransformer (X) +(56) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, o_orderkey#X] Arguments: false -(51) ColumnarExchange +(57) VeloxAppendBatches +Input [2]: [hash_partition_key#X, o_orderkey#X] +Arguments: X + +(58) ColumnarExchange Input [2]: [hash_partition_key#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [id=#X] -(52) ShuffleQueryStage +(59) ShuffleQueryStage Output [1]: [o_orderkey#X] Arguments: X -(53) InputAdapter +(60) InputAdapter Input [1]: [o_orderkey#X] -(54) InputIteratorTransformer +(61) InputIteratorTransformer Input [1]: [o_orderkey#X] -(55) ShuffledHashJoinExecTransformer +(62) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [s_name#X, s_nationkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [s_name#X, s_nationkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [s_name#X, s_nationkey#X] -(62) Scan parquet +(70) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(63) FilterExecTransformer +(71) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(64) ProjectExecTransformer +(72) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(65) WholeStageCodegenTransformer (X) +(73) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(66) ColumnarExchange +(74) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(75) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(67) ShuffleQueryStage +(76) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(68) InputAdapter +(77) InputAdapter Input [1]: [n_nationkey#X] -(69) InputIteratorTransformer +(78) InputIteratorTransformer Input [1]: [n_nationkey#X] -(70) ShuffledHashJoinExecTransformer +(79) 
ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(72) FlushableHashAggregateExecTransformer +(81) FlushableHashAggregateExecTransformer Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(73) ProjectExecTransformer +(82) ProjectExecTransformer Output [3]: [hash(s_name#X, 42) AS hash_partition_key#X, s_name#X, count#X] Input [2]: [s_name#X, count#X] -(74) WholeStageCodegenTransformer (X) +(83) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: false -(75) ColumnarExchange +(84) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, count#X] +Arguments: X + +(85) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] -(76) ShuffleQueryStage +(86) ShuffleQueryStage Output [2]: [s_name#X, count#X] Arguments: X -(77) InputAdapter +(87) InputAdapter Input [2]: [s_name#X, count#X] -(78) InputIteratorTransformer +(88) InputIteratorTransformer Input [2]: [s_name#X, count#X] -(79) RegularHashAggregateExecTransformer +(89) RegularHashAggregateExecTransformer Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(80) WholeStageCodegenTransformer (X) +(90) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, numwait#X] Arguments: false -(81) TakeOrderedAndProjectExecTransformer +(91) TakeOrderedAndProjectExecTransformer Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X], 0 -(82) VeloxColumnarToRowExec +(92) VeloxColumnarToRowExec Input [2]: [s_name#X, numwait#X] -(83) Scan parquet +(93) Scan parquet Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(84) Filter +(94) Filter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(85) Exchange +(95) Exchange Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(86) Scan parquet +(96) Sort +Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(97) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(87) Filter +(98) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(88) Project +(99) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(89) Exchange +(100) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) Scan parquet +(101) Sort +Input [2]: 
[l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(102) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(91) Exchange +(103) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(92) ShuffledHashJoin +(104) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(105) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(93) Scan parquet +(106) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(94) Filter +(107) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(95) Project +(108) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(96) Exchange +(109) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) ShuffledHashJoin +(110) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(111) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(98) Exchange +(112) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) ShuffledHashJoin +(113) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(114) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(100) Project +(115) Project Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(101) Exchange +(116) Exchange Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) Scan parquet +(117) Sort +Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(118) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(103) Filter +(119) Filter Input [2]: [o_orderkey#X, o_orderstatus#X] Condition : ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(104) Project +(120) Project Output [1]: [o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(105) Exchange +(121) Exchange Input [1]: [o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) ShuffledHashJoin +(122) Sort +Input [1]: [o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(123) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(107) Project +(124) Project Output [2]: [s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(108) Exchange +(125) Exchange Input [2]: [s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), 
ENSURE_REQUIREMENTS, [plan_id=X] -(109) Scan parquet +(126) Sort +Input [2]: [s_name#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(127) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(110) Filter +(128) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(111) Project +(129) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(112) Exchange +(130) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(113) ShuffledHashJoin +(131) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(114) Project +(133) Project Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(115) HashAggregate +(134) HashAggregate Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(116) Exchange +(135) Exchange Input [2]: [s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) HashAggregate +(136) HashAggregate Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(118) TakeOrderedAndProject +(137) TakeOrderedAndProject Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X] -(119) AdaptiveSparkPlan +(138) AdaptiveSparkPlan Output [2]: [s_name#X, numwait#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/22.txt index 918335dff38ba..fcf712a9d5fdf 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/22.txt @@ -1,47 +1,49 @@ == Physical Plan == -AdaptiveSparkPlan (46) +AdaptiveSparkPlan (52) +- == Final Plan == - VeloxColumnarToRowExec (33) - +- ^ SortExecTransformer (31) - +- ^ InputIteratorTransformer (30) - +- ^ InputAdapter (29) - +- ^ ShuffleQueryStage (28) - +- ColumnarExchange (27) - +- ^ RegularHashAggregateExecTransformer (25) - +- ^ InputIteratorTransformer (24) - +- ^ InputAdapter (23) - +- ^ ShuffleQueryStage (22) - +- ColumnarExchange (21) - +- ^ ProjectExecTransformer (19) - +- ^ FlushableHashAggregateExecTransformer (18) - +- ^ ProjectExecTransformer (17) - +- ^ ShuffledHashJoinExecTransformer LeftAnti (16) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (37) + +- ^ SortExecTransformer (35) + +- ^ InputIteratorTransformer (34) + +- ShuffleQueryStage (32) + +- ColumnarExchange (31) + +- VeloxAppendBatches (30) + +- ^ RegularHashAggregateExecTransformer (28) + +- ^ InputIteratorTransformer (27) + +- ShuffleQueryStage (25) + +- ColumnarExchange (24) + +- VeloxAppendBatches (23) + +- ^ ProjectExecTransformer (21) + +- ^ FlushableHashAggregateExecTransformer (20) + +- ^ ProjectExecTransformer (19) + +- ^ 
ShuffledHashJoinExecTransformer LeftAnti BuildRight (18) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (15) - +- ^ InputAdapter (14) - +- ^ ShuffleQueryStage (13) - +- ColumnarExchange (12) - +- ^ ProjectExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (17) + +- ShuffleQueryStage (15) + +- ColumnarExchange (14) + +- VeloxAppendBatches (13) + +- ^ ProjectExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (45) - +- Exchange (44) - +- HashAggregate (43) - +- Exchange (42) - +- HashAggregate (41) - +- Project (40) - +- ShuffledHashJoin LeftAnti BuildRight (39) - :- Exchange (36) - : +- Filter (35) - : +- Scan parquet (34) - +- Exchange (38) - +- Scan parquet (37) + Sort (51) + +- Exchange (50) + +- HashAggregate (49) + +- Exchange (48) + +- HashAggregate (47) + +- Project (46) + +- SortMergeJoin LeftAnti (45) + :- Sort (41) + : +- Exchange (40) + : +- Filter (39) + : +- Scan parquet (38) + +- Sort (44) + +- Exchange (43) + +- Scan parquet (42) (1) Scan parquet @@ -63,182 +65,206 @@ Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(9) Scan parquet +(10) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(10) ProjectExecTransformer +(11) ProjectExecTransformer Output [2]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_custkey#X] Input [1]: [o_custkey#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, o_custkey#X] Arguments: false -(12) ColumnarExchange +(13) VeloxAppendBatches +Input [2]: [hash_partition_key#X, o_custkey#X] +Arguments: X + +(14) ColumnarExchange Input [2]: [hash_partition_key#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [id=#X] -(13) ShuffleQueryStage +(15) ShuffleQueryStage Output [1]: [o_custkey#X] Arguments: X -(14) InputAdapter +(16) InputAdapter Input [1]: [o_custkey#X] -(15) InputIteratorTransformer +(17) InputIteratorTransformer Input [1]: [o_custkey#X] -(16) ShuffledHashJoinExecTransformer +(18) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(17) ProjectExecTransformer +(19) ProjectExecTransformer Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(18) FlushableHashAggregateExecTransformer +(20) FlushableHashAggregateExecTransformer Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: 
[count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [5]: [hash(cntrycode#X, 42) AS hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] +Arguments: X + +(24) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(25) RegularHashAggregateExecTransformer +(28) RegularHashAggregateExecTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(26) WholeStageCodegenTransformer (X) +(29) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(27) ColumnarExchange +(30) VeloxAppendBatches +Input [3]: [cntrycode#X, numcust#X, totacctbal#X] +Arguments: X + +(31) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(28) ShuffleQueryStage +(32) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: X -(29) InputAdapter +(33) InputAdapter Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(30) InputIteratorTransformer +(34) InputIteratorTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(31) SortExecTransformer +(35) SortExecTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(32) WholeStageCodegenTransformer (X) +(36) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(33) VeloxColumnarToRowExec +(37) VeloxColumnarToRowExec Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(34) Scan parquet +(38) Scan parquet Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal)] ReadSchema: struct -(35) Filter +(39) Filter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) AND (cast(c_acctbal#X as decimal(16,6)) > Subquery subquery#X, [id=#X])) -(36) Exchange +(40) Exchange Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(37) Scan parquet +(41) Sort +Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(42) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Exchange +(43) Exchange Input [1]: [o_custkey#X] Arguments: 
hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) ShuffledHashJoin +(44) Sort +Input [1]: [o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(45) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(40) Project +(46) Project Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(41) HashAggregate +(47) HashAggregate Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(42) Exchange +(48) Exchange Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) HashAggregate +(49) HashAggregate Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(44) Exchange +(50) Exchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(45) Sort +(51) Sort Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(46) AdaptiveSparkPlan +(52) AdaptiveSparkPlan Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/3.txt index 157e13d105e4e..607d6444f4324 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/3.txt @@ -1,60 +1,64 @@ == Physical Plan == -AdaptiveSparkPlan (59) +AdaptiveSparkPlan (67) +- == Final Plan == - VeloxColumnarToRowExec (39) - +- TakeOrderedAndProjectExecTransformer (38) - +- ^ ProjectExecTransformer (36) - +- ^ RegularHashAggregateExecTransformer (35) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ ProjectExecTransformer (33) - +- ^ ShuffledHashJoinExecTransformer Inner (32) - :- ^ InputIteratorTransformer (23) - : +- ^ InputAdapter (22) - : +- ^ ShuffleQueryStage (21) - : +- ColumnarExchange (20) - : +- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (43) + +- TakeOrderedAndProjectExecTransformer (42) + +- ^ ProjectExecTransformer (40) + +- ^ RegularHashAggregateExecTransformer (39) + +- ^ RegularHashAggregateExecTransformer (38) + +- ^ ProjectExecTransformer (37) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + :- ^ InputIteratorTransformer (26) + : +- ShuffleQueryStage (24) + : +- ColumnarExchange (23) + : +- VeloxAppendBatches (22) + : +- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ 
ShuffleQueryStage (14) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ FilterExecTransformer (10) - : +- ^ Scan parquet (9) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29) - +- ColumnarExchange (28) - +- ^ ProjectExecTransformer (26) - +- ^ FilterExecTransformer (25) - +- ^ Scan parquet (24) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FilterExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ ProjectExecTransformer (29) + +- ^ FilterExecTransformer (28) + +- ^ Scan parquet (27) +- == Initial Plan == - TakeOrderedAndProject (58) - +- HashAggregate (57) - +- HashAggregate (56) - +- Project (55) - +- ShuffledHashJoin Inner BuildRight (54) - :- Exchange (49) - : +- Project (48) - : +- ShuffledHashJoin Inner BuildLeft (47) - : :- Exchange (43) - : : +- Project (42) - : : +- Filter (41) - : : +- Scan parquet (40) - : +- Exchange (46) - : +- Filter (45) - : +- Scan parquet (44) - +- Exchange (53) - +- Project (52) - +- Filter (51) - +- Scan parquet (50) + TakeOrderedAndProject (66) + +- HashAggregate (65) + +- HashAggregate (64) + +- Project (63) + +- SortMergeJoin Inner (62) + :- Sort (56) + : +- Exchange (55) + : +- Project (54) + : +- SortMergeJoin Inner (53) + : :- Sort (48) + : : +- Exchange (47) + : : +- Project (46) + : : +- Filter (45) + : : +- Scan parquet (44) + : +- Sort (52) + : +- Exchange (51) + : +- Filter (50) + : +- Scan parquet (49) + +- Sort (61) + +- Exchange (60) + +- Project (59) + +- Filter (58) + +- Scan parquet (57) (1) Scan parquet @@ -76,244 +80,276 @@ Input [2]: [c_custkey#X, c_mktsegment#X] Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, c_custkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [c_custkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [c_custkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [c_custkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, 
o_shippriority#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] +Arguments: X + +(23) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) 
InputIteratorTransformer Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [6]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS _pre_X#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(34) RegularHashAggregateExecTransformer +(38) RegularHashAggregateExecTransformer Input [6]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(35) RegularHashAggregateExecTransformer +(39) RegularHashAggregateExecTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(36) ProjectExecTransformer +(40) ProjectExecTransformer Output [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, o_orderdate#X, o_shippriority#X] Input [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] -(37) WholeStageCodegenTransformer (X) +(41) WholeStageCodegenTransformer (X) Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: false -(38) TakeOrderedAndProjectExecTransformer +(42) TakeOrderedAndProjectExecTransformer Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X], 0 -(39) VeloxColumnarToRowExec +(43) VeloxColumnarToRowExec Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(40) Scan parquet +(44) Scan parquet Output [2]: [c_custkey#X, c_mktsegment#X] Batched: true Location: InMemoryFileIndex [*] 
PushedFilters: [IsNotNull(c_mktsegment), EqualTo(c_mktsegment,BUILDING), IsNotNull(c_custkey)] ReadSchema: struct -(41) Filter +(45) Filter Input [2]: [c_custkey#X, c_mktsegment#X] Condition : ((isnotnull(c_mktsegment#X) AND (c_mktsegment#X = BUILDING)) AND isnotnull(c_custkey#X)) -(42) Project +(46) Project Output [1]: [c_custkey#X] Input [2]: [c_custkey#X, c_mktsegment#X] -(43) Exchange +(47) Exchange Input [1]: [c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(44) Scan parquet +(48) Sort +Input [1]: [c_custkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(49) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(45) Filter +(50) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Condition : (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(46) Exchange +(51) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(47) ShuffledHashJoin +(52) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(53) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(48) Project +(54) Project Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(49) Exchange +(55) Exchange Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(50) Scan parquet +(56) Sort +Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(57) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(51) Filter +(58) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(52) Project +(59) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(53) Exchange +(60) Exchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) ShuffledHashJoin +(61) Sort +Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(62) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(55) Project +(63) Project Output [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(56) HashAggregate +(64) HashAggregate Input [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: 
[partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(57) HashAggregate +(65) HashAggregate Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X, o_orderdate#X, o_shippriority#X] -(58) TakeOrderedAndProject +(66) TakeOrderedAndProject Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(59) AdaptiveSparkPlan +(67) AdaptiveSparkPlan Output [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/4.txt index 74fca4208dd54..cc6b8f351600a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/4.txt @@ -1,51 +1,53 @@ == Physical Plan == -AdaptiveSparkPlan (50) +AdaptiveSparkPlan (56) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- ^ SortExecTransformer (32) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29) - +- ColumnarExchange (28) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer LeftSemi (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : 
+- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (49) - +- Exchange (48) - +- HashAggregate (47) - +- Exchange (46) - +- HashAggregate (45) - +- Project (44) - +- ShuffledHashJoin LeftSemi BuildRight (43) - :- Exchange (38) - : +- Project (37) - : +- Filter (36) - : +- Scan parquet (35) - +- Exchange (42) - +- Project (41) - +- Filter (40) - +- Scan parquet (39) + Sort (55) + +- Exchange (54) + +- HashAggregate (53) + +- Exchange (52) + +- HashAggregate (51) + +- Project (50) + +- SortMergeJoin LeftSemi (49) + :- Sort (43) + : +- Exchange (42) + : +- Project (41) + : +- Filter (40) + : +- Scan parquet (39) + +- Sort (48) + +- Exchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -67,200 +69,224 @@ Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [o_orderkey#X, o_orderpriority#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderpriority#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Arguments: ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [2]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, l_orderkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [2]: [hash_partition_key#X, l_orderkey#X] +Arguments: X + +(15) ColumnarExchange Input [2]: [hash_partition_key#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [1]: [l_orderkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [1]: [l_orderkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [1]: [l_orderkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer 
Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [3]: [hash(o_orderpriority#X, 42) AS hash_partition_key#X, o_orderpriority#X, count#X] Input [2]: [o_orderpriority#X, count#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] +Arguments: X + +(25) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [2]: [o_orderpriority#X, count#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [2]: [o_orderpriority#X, count#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [2]: [o_orderpriority#X, order_count#X] +Arguments: X + +(32) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [o_orderpriority#X, order_count#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [o_orderpriority#X, order_count#X] -(32) SortExecTransformer +(36) SortExecTransformer Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(33) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(34) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [2]: [o_orderpriority#X, order_count#X] -(35) Scan parquet +(39) Scan parquet Output [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-07-01), LessThan(o_orderdate,1993-10-01)] ReadSchema: struct -(36) Filter +(40) Filter Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Condition : ((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-07-01)) AND (o_orderdate#X < 1993-10-01)) -(37) Project +(41) Project Output [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] -(38) Exchange +(42) Exchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) Scan parquet +(43) Sort +Input [2]: [o_orderkey#X, o_orderpriority#X] 
+Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(44) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(40) Filter +(45) Filter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(41) Project +(46) Project Output [1]: [l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(42) Exchange +(47) Exchange Input [1]: [l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) ShuffledHashJoin +(48) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(49) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(44) Project +(50) Project Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(45) HashAggregate +(51) HashAggregate Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(46) Exchange +(52) Exchange Input [2]: [o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(47) HashAggregate +(53) HashAggregate Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(48) Exchange +(54) Exchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) Sort +(55) Sort Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(50) AdaptiveSparkPlan +(56) AdaptiveSparkPlan Output [2]: [o_orderpriority#X, order_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/5.txt index c01060f99d4b4..a1f95887aae33 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/5.txt @@ -1,127 +1,137 @@ == Physical Plan == -AdaptiveSparkPlan (134) +AdaptiveSparkPlan (156) +- == Final Plan == - VeloxColumnarToRowExec (94) - +- ^ SortExecTransformer (92) - +- ^ InputIteratorTransformer (91) - +- ^ InputAdapter (90) - +- ^ ShuffleQueryStage (89) - +- ColumnarExchange (88) - +- ^ RegularHashAggregateExecTransformer (86) - +- ^ InputIteratorTransformer (85) - +- ^ InputAdapter (84) - +- ^ ShuffleQueryStage (83) - +- ColumnarExchange (82) - +- ^ ProjectExecTransformer (80) - +- ^ FlushableHashAggregateExecTransformer (79) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : 
: +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (106) + +- ^ SortExecTransformer (104) + +- ^ InputIteratorTransformer (103) + +- ShuffleQueryStage (101) + +- ColumnarExchange (100) + +- VeloxAppendBatches (99) + +- ^ RegularHashAggregateExecTransformer (97) + +- ^ InputIteratorTransformer (96) + +- ShuffleQueryStage (94) + +- ColumnarExchange (93) + +- VeloxAppendBatches (92) + +- ^ ProjectExecTransformer (90) + +- ^ FlushableHashAggregateExecTransformer (89) + +- ^ ProjectExecTransformer (88) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ FilterExecTransformer (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ FilterExecTransformer (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ FilterExecTransformer (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ FilterExecTransformer (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ FilterExecTransformer (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ FilterExecTransformer 
(70) - +- ^ Scan parquet (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ FilterExecTransformer (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ FilterExecTransformer (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ FilterExecTransformer (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ FilterExecTransformer (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (86) + +- ShuffleQueryStage (84) + +- ColumnarExchange (83) + +- VeloxAppendBatches (82) + +- ^ ProjectExecTransformer (80) + +- ^ FilterExecTransformer (79) + +- ^ Scan parquet (78) +- == Initial Plan == - Sort (133) - +- Exchange (132) - +- HashAggregate (131) - +- Exchange (130) - +- HashAggregate (129) - +- Project (128) - +- ShuffledHashJoin Inner BuildRight (127) - :- Exchange (122) - : +- Project (121) - : +- ShuffledHashJoin Inner BuildRight (120) - : :- Exchange (116) - : : +- Project (115) - : : +- ShuffledHashJoin Inner BuildRight (114) - : : :- Exchange (110) - : : : +- Project (109) - : : : +- ShuffledHashJoin Inner BuildRight (108) - : : : :- Exchange (104) - : : : : +- Project (103) - : : : : +- ShuffledHashJoin Inner BuildLeft (102) - : : : : :- Exchange (97) - : : : : : +- Filter (96) - : : : : : +- Scan parquet (95) - : : : : +- Exchange (101) - : : : : +- Project (100) - : : : : +- Filter (99) - : : : : +- Scan parquet (98) - : : : +- Exchange (107) - : : : +- Filter (106) - : : : +- Scan parquet (105) - : : +- Exchange (113) - : : +- Filter (112) - : : +- Scan parquet (111) - : +- Exchange (119) - : +- Filter (118) - : +- Scan parquet (117) - +- Exchange (126) - +- Project (125) - +- Filter (124) - +- Scan parquet (123) + Sort (155) + +- Exchange (154) + +- HashAggregate (153) + +- Exchange (152) + +- HashAggregate (151) + +- Project (150) + +- SortMergeJoin Inner (149) + :- Sort (143) + : +- Exchange (142) + : +- Project (141) + : +- SortMergeJoin Inner (140) + : :- Sort (135) + : : +- Exchange (134) + : : +- Project (133) + : : +- SortMergeJoin Inner (132) + : : :- Sort (127) + : : : +- Exchange (126) + : : : +- Project (125) + : : : +- SortMergeJoin Inner (124) + : : : :- Sort (119) + : : : : +- Exchange (118) + : : : : +- Project (117) + : : : : +- SortMergeJoin Inner (116) + : : : : :- Sort (110) + : : : : : +- Exchange (109) + : : : : : +- Filter (108) + : : : : : +- Scan parquet (107) + : : : : +- Sort (115) + : : : : +- Exchange (114) + : : : : +- Project (113) + : : : : +- Filter (112) + : : : : +- Scan parquet (111) + : : : +- Sort (123) + : : : +- Exchange (122) + : : : +- Filter (121) + : : : +- Scan parquet (120) + : : +- Sort (131) + : : +- Exchange (130) + : : +- Filter (129) + : : +- Scan parquet (128) + : +- Sort (139) + : +- Exchange (138) + : +- Filter (137) + : +- Scan parquet (136) + +- Sort (148) + +- Exchange (147) + +- Project (146) + +- Filter (145) + +- Scan parquet (144) (1) Scan parquet @@ -143,552 
+153,640 @@ Input [2]: [c_custkey#X, c_nationkey#X] Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] +Arguments: X + +(23) ColumnarExchange Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [2]: [c_nationkey#X, o_orderkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [2]: [c_nationkey#X, o_orderkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [2]: [c_nationkey#X, o_orderkey#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] 
Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [5]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [5]: [hash(l_suppkey#X, c_nationkey#X, 42) AS hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(40) ColumnarExchange Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, s_nationkey#X, 42) AS hash_partition_key#X, s_suppkey#X, 
s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(57) ColumnarExchange Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] -(54) Scan parquet +(61) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(55) FilterExecTransformer +(62) FilterExecTransformer Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [4]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] +Arguments: X + +(66) ColumnarExchange Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] 
-(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [5]: [hash(n_regionkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] +Arguments: X + +(74) ColumnarExchange Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(70) FilterExecTransformer +(79) FilterExecTransformer Input [2]: [r_regionkey#X, r_name#X] Arguments: ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [2]: [hash(r_regionkey#X, 42) AS hash_partition_key#X, r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [2]: [hash_partition_key#X, r_regionkey#X] +Arguments: X + +(83) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [1]: [r_regionkey#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [1]: [r_regionkey#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [1]: [r_regionkey#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS _pre_X#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(79) FlushableHashAggregateExecTransformer +(89) FlushableHashAggregateExecTransformer Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, _pre_X#X] Keys [1]: 
[n_name#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(80) ProjectExecTransformer +(90) ProjectExecTransformer Output [4]: [hash(n_name#X, 42) AS hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Input [3]: [n_name#X, sum#X, isEmpty#X] -(81) WholeStageCodegenTransformer (X) +(91) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: false -(82) ColumnarExchange +(92) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] +Arguments: X + +(93) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(83) ShuffleQueryStage +(94) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] Arguments: X -(84) InputAdapter +(95) InputAdapter Input [3]: [n_name#X, sum#X, isEmpty#X] -(85) InputIteratorTransformer +(96) InputIteratorTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] -(86) RegularHashAggregateExecTransformer +(97) RegularHashAggregateExecTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [2]: [n_name#X, revenue#X] +Arguments: X + +(100) ColumnarExchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [2]: [n_name#X, revenue#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [2]: [n_name#X, revenue#X] -(92) SortExecTransformer +(104) SortExecTransformer Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(93) WholeStageCodegenTransformer (X) +(105) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(94) VeloxColumnarToRowExec +(106) VeloxColumnarToRowExec Input [2]: [n_name#X, revenue#X] -(95) Scan parquet +(107) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(96) Filter +(108) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(97) Exchange +(109) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(98) Scan parquet +(110) Sort +Input [2]: 
[c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(111) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(99) Filter +(112) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(100) Project +(113) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(101) Exchange +(114) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) ShuffledHashJoin +(115) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(116) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(103) Project +(117) Project Output [2]: [c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(104) Exchange +(118) Exchange Input [2]: [c_nationkey#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) Scan parquet +(119) Sort +Input [2]: [c_nationkey#X, o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(120) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(106) Filter +(121) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(107) Exchange +(122) Exchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) ShuffledHashJoin +(123) Sort +Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(124) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(109) Project +(125) Project Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(110) Exchange +(126) Exchange Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(111) Scan parquet +(127) Sort +Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST, c_nationkey#X ASC NULLS FIRST], false, 0 + +(128) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(112) Filter +(129) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(113) Exchange +(130) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(114) ShuffledHashJoin +(131) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] 
+Arguments: [s_suppkey#X ASC NULLS FIRST, s_nationkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join condition: None -(115) Project +(133) Project Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(116) Exchange +(134) Exchange Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) Scan parquet +(135) Sort +Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(136) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(118) Filter +(137) Filter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(119) Exchange +(138) Exchange Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(120) ShuffledHashJoin +(139) Sort +Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(140) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(121) Project +(141) Project Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(122) Exchange +(142) Exchange Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(123) Scan parquet +(143) Sort +Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] +Arguments: [n_regionkey#X ASC NULLS FIRST], false, 0 + +(144) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(124) Filter +(145) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(125) Project +(146) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(126) Exchange +(147) Exchange Input [1]: [r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(127) ShuffledHashJoin +(148) Sort +Input [1]: [r_regionkey#X] +Arguments: [r_regionkey#X ASC NULLS FIRST], false, 0 + +(149) SortMergeJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(128) Project +(150) Project Output [3]: [l_extendedprice#X, l_discount#X, n_name#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(129) HashAggregate +(151) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, n_name#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(130) Exchange +(152) Exchange Input [3]: [n_name#X, sum#X, isEmpty#X] 
Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(131) HashAggregate +(153) HashAggregate Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true))#X AS revenue#X] -(132) Exchange +(154) Exchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(133) Sort +(155) Sort Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(134) AdaptiveSparkPlan +(156) AdaptiveSparkPlan Output [2]: [n_name#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/6.txt index fa2536a365c8a..786a89fe715ae 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/6.txt @@ -1,23 +1,23 @@ == Physical Plan == -AdaptiveSparkPlan (19) +AdaptiveSparkPlan (20) +- == Final Plan == - VeloxColumnarToRowExec (12) - +- ^ RegularHashAggregateExecTransformer (10) - +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ ShuffleQueryStage (7) - +- ColumnarExchange (6) + VeloxColumnarToRowExec (13) + +- ^ RegularHashAggregateExecTransformer (11) + +- ^ InputIteratorTransformer (10) + +- ShuffleQueryStage (8) + +- ColumnarExchange (7) + +- VeloxAppendBatches (6) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ FilterExecTransformer (2) +- ^ Scan parquet (1) +- == Initial Plan == - HashAggregate (18) - +- Exchange (17) - +- HashAggregate (16) - +- Project (15) - +- Filter (14) - +- Scan parquet (13) + HashAggregate (19) + +- Exchange (18) + +- HashAggregate (17) + +- Project (16) + +- Filter (15) + +- Scan parquet (14) (1) Scan parquet @@ -46,67 +46,71 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(6) ColumnarExchange +(6) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(7) ShuffleQueryStage +(8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(8) InputAdapter +(9) InputAdapter Input [2]: [sum#X, isEmpty#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(10) RegularHashAggregateExecTransformer +(11) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * 
promote_precision(l_discount#X)), DecimalType(25,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X AS revenue#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(12) VeloxColumnarToRowExec +(13) VeloxColumnarToRowExec Input [1]: [revenue#X] -(13) Scan parquet +(14) Scan parquet Output [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), IsNotNull(l_discount), IsNotNull(l_quantity), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), GreaterThanOrEqual(l_discount,0.05), LessThanOrEqual(l_discount,0.07), LessThan(l_quantity,24.00)] ReadSchema: struct -(14) Filter +(15) Filter Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((((((isnotnull(l_shipdate#X) AND isnotnull(l_discount#X)) AND isnotnull(l_quantity#X)) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND (l_discount#X >= 0.05)) AND (l_discount#X <= 0.07)) AND (l_quantity#X < 24.00)) -(15) Project +(16) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) HashAggregate +(17) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(17) Exchange +(18) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(18) HashAggregate +(19) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4), true))#X AS revenue#X] -(19) AdaptiveSparkPlan +(20) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/7.txt index 9cf9167180397..64d51413a0849 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/7.txt @@ -1,122 +1,131 @@ == Physical Plan == -AdaptiveSparkPlan (128) +AdaptiveSparkPlan (149) +- == Final Plan == - VeloxColumnarToRowExec (90) - +- ^ SortExecTransformer (88) - +- ^ InputIteratorTransformer (87) - +- ^ InputAdapter (86) - +- ^ ShuffleQueryStage (85) - +- ColumnarExchange (84) - +- ^ RegularHashAggregateExecTransformer (82) - +- ^ InputIteratorTransformer (81) - +- ^ InputAdapter (80) - +- ^ ShuffleQueryStage (79) - +- ColumnarExchange (78) - +- ^ ProjectExecTransformer (76) - +- ^ FlushableHashAggregateExecTransformer (75) - +- ^ ProjectExecTransformer (74) - +- ^ ShuffledHashJoinExecTransformer Inner (73) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer 
(63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (101) + +- ^ SortExecTransformer (99) + +- ^ InputIteratorTransformer (98) + +- ShuffleQueryStage (96) + +- ColumnarExchange (95) + +- VeloxAppendBatches (94) + +- ^ RegularHashAggregateExecTransformer (92) + +- ^ InputIteratorTransformer (91) + +- ShuffleQueryStage (89) + +- ColumnarExchange (88) + +- VeloxAppendBatches (87) + +- ^ ProjectExecTransformer (85) + +- ^ FlushableHashAggregateExecTransformer (84) + +- ^ ProjectExecTransformer (83) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (82) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ FilterExecTransformer (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ FilterExecTransformer (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ FilterExecTransformer (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ FilterExecTransformer (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ 
ShuffleQueryStage (59) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ FilterExecTransformer (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (72) - +- ^ InputAdapter (71) - +- ^ ShuffleQueryStage (70) - +- ReusedExchange (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ FilterExecTransformer (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ FilterExecTransformer (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ FilterExecTransformer (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ FilterExecTransformer (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (81) + +- ShuffleQueryStage (79) + +- ReusedExchange (78) +- == Initial Plan == - Sort (127) - +- Exchange (126) - +- HashAggregate (125) - +- Exchange (124) - +- HashAggregate (123) - +- Project (122) - +- ShuffledHashJoin Inner BuildRight (121) - :- Exchange (117) - : +- Project (116) - : +- ShuffledHashJoin Inner BuildRight (115) - : :- Exchange (111) - : : +- Project (110) - : : +- ShuffledHashJoin Inner BuildRight (109) - : : :- Exchange (105) - : : : +- Project (104) - : : : +- ShuffledHashJoin Inner BuildRight (103) - : : : :- Exchange (99) - : : : : +- Project (98) - : : : : +- ShuffledHashJoin Inner BuildLeft (97) - : : : : :- Exchange (93) - : : : : : +- Filter (92) - : : : : : +- Scan parquet (91) - : : : : +- Exchange (96) - : : : : +- Filter (95) - : : : : +- Scan parquet (94) - : : : +- Exchange (102) - : : : +- Filter (101) - : : : +- Scan parquet (100) - : : +- Exchange (108) - : : +- Filter (107) - : : +- Scan parquet (106) - : +- Exchange (114) - : +- Filter (113) - : +- Scan parquet (112) - +- Exchange (120) - +- Filter (119) - +- Scan parquet (118) + Sort (148) + +- Exchange (147) + +- HashAggregate (146) + +- Exchange (145) + +- HashAggregate (144) + +- Project (143) + +- SortMergeJoin Inner (142) + :- Sort (137) + : +- Exchange (136) + : +- Project (135) + : +- SortMergeJoin Inner (134) + : :- Sort (129) + : : +- Exchange (128) + : : +- Project (127) + : : +- SortMergeJoin Inner (126) + : : :- Sort (121) + : : : +- Exchange (120) + : : : +- Project (119) + : : : +- SortMergeJoin Inner (118) + : : : :- Sort (113) + : : : : +- Exchange (112) + : : : : +- Project (111) + : : : : +- SortMergeJoin Inner (110) + : : : : :- Sort (105) + : : : : : +- Exchange (104) + : : : : : +- Filter (103) + : : : : : +- Scan parquet (102) + : : : : +- Sort (109) + : : : : +- Exchange (108) + : : : : +- Filter (107) + : : : : +- Scan parquet (106) + : : : +- Sort (117) + : : : +- Exchange (116) + : : : +- Filter (115) + : : : +- Scan parquet (114) + : : +- Sort (125) + : : +- Exchange (124) + : : +- Filter (123) + : : +- Scan parquet (122) + : +- Sort (133) + : +- Exchange (132) + : +- Filter (131) + : +- Scan parquet (130) + +- Sort (141) + +- Exchange (140) + +- Filter (139) + +- Scan parquet (138) (1) Scan 
parquet @@ -138,524 +147,608 @@ Input [2]: [s_suppkey#X, s_nationkey#X] Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [6]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: X + +(15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [6]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, 
l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: X + +(23) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [2]: [o_orderkey#X, o_custkey#X] Arguments: (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [2]: [o_orderkey#X, o_custkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [6]: [hash(o_custkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] +Arguments: X + +(40) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -(38) 
InputIteratorTransformer +(43) InputIteratorTransformer Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [2]: [c_custkey#X, c_nationkey#X] Arguments: (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(c_custkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_nationkey#X] Input [2]: [c_custkey#X, c_nationkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [6]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] +Arguments: X + +(57) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(55) FilterExecTransformer +(62) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) 
AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [6]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] +Arguments: X + +(74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -(69) ReusedExchange [Reuses operator id: 58] +(78) ReusedExchange [Reuses operator id: 66] Output [2]: [n_nationkey#X, n_name#X] -(70) ShuffleQueryStage +(79) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(71) InputAdapter +(80) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(72) InputIteratorTransformer +(81) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(73) ShuffledHashJoinExecTransformer +(82) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(74) ProjectExecTransformer +(83) ProjectExecTransformer Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(75) 
FlushableHashAggregateExecTransformer +(84) FlushableHashAggregateExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(76) ProjectExecTransformer +(85) ProjectExecTransformer Output [6]: [hash(supp_nation#X, cust_nation#X, l_year#X, 42) AS hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(77) WholeStageCodegenTransformer (X) +(86) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: false -(78) ColumnarExchange +(87) VeloxAppendBatches +Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] +Arguments: X + +(88) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(79) ShuffleQueryStage +(89) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: X -(80) InputAdapter +(90) InputAdapter Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(81) InputIteratorTransformer +(91) InputIteratorTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(82) RegularHashAggregateExecTransformer +(92) RegularHashAggregateExecTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(83) WholeStageCodegenTransformer (X) +(93) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(84) ColumnarExchange +(94) VeloxAppendBatches +Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] +Arguments: X + +(95) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(85) ShuffleQueryStage +(96) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: X -(86) InputAdapter +(97) InputAdapter Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(87) InputIteratorTransformer +(98) InputIteratorTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(88) SortExecTransformer +(99) SortExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(89) WholeStageCodegenTransformer (X) +(100) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(90) VeloxColumnarToRowExec +(101) VeloxColumnarToRowExec Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(91) Scan parquet +(102) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), 
IsNotNull(s_nationkey)] ReadSchema: struct -(92) Filter +(103) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(93) Exchange +(104) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(94) Scan parquet +(105) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(106) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(95) Filter +(107) Filter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(96) Exchange +(108) Exchange Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) ShuffledHashJoin +(109) Sort +Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(110) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(98) Project +(111) Project Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(99) Exchange +(112) Exchange Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(100) Scan parquet +(113) Sort +Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(114) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(101) Filter +(115) Filter Input [2]: [o_orderkey#X, o_custkey#X] Condition : (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(102) Exchange +(116) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(103) ShuffledHashJoin +(117) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(118) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(104) Project +(119) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(105) Exchange +(120) Exchange Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) Scan parquet +(121) Sort +Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(122) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex 
[*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(107) Filter +(123) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(108) Exchange +(124) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(125) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(126) SortMergeJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(110) Project +(127) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(111) Exchange +(128) Exchange Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) Scan parquet +(129) Sort +Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(130) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(113) Filter +(131) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(114) Exchange +(132) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(115) ShuffledHashJoin +(133) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(134) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(116) Project +(135) Project Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(117) Exchange +(136) Exchange Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(118) Scan parquet +(137) Sort +Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(138) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,GERMANY),EqualTo(n_name,FRANCE))] ReadSchema: struct -(119) Filter +(139) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = GERMANY) OR (n_name#X = FRANCE))) -(120) Exchange +(140) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(121) ShuffledHashJoin +(141) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(142) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(122) Project +(143) Project Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, 
CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(123) HashAggregate +(144) HashAggregate Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(124) Exchange +(145) Exchange Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(125) HashAggregate +(146) HashAggregate Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(126) Exchange +(147) Exchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(127) Sort +(148) Sort Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(128) AdaptiveSparkPlan +(149) AdaptiveSparkPlan Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/8.txt index 1e191ae4d23c6..8934d1a2e7a39 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/8.txt @@ -1,166 +1,180 @@ == Physical Plan == -AdaptiveSparkPlan (177) +AdaptiveSparkPlan (207) +- == Final Plan == - VeloxColumnarToRowExec (125) - +- ^ SortExecTransformer (123) - +- ^ InputIteratorTransformer (122) - +- ^ InputAdapter (121) - +- ^ ShuffleQueryStage (120) - +- ColumnarExchange (119) - +- ^ ProjectExecTransformer (117) - +- ^ RegularHashAggregateExecTransformer (116) - +- ^ InputIteratorTransformer (115) - +- ^ InputAdapter (114) - +- ^ ShuffleQueryStage (113) - +- ColumnarExchange (112) - +- ^ ProjectExecTransformer (110) - +- ^ FlushableHashAggregateExecTransformer (109) - +- ^ ProjectExecTransformer (108) - +- ^ ShuffledHashJoinExecTransformer Inner (107) - :- ^ InputIteratorTransformer (98) - : +- ^ InputAdapter (97) - : +- ^ ShuffleQueryStage (96) - : +- ColumnarExchange (95) - : +- ^ ProjectExecTransformer (93) - : +- ^ ShuffledHashJoinExecTransformer Inner (92) - : :- ^ InputIteratorTransformer (83) - : : +- ^ InputAdapter (82) - : : +- ^ ShuffleQueryStage (81) - : : +- ColumnarExchange (80) - : : +- ^ ProjectExecTransformer (78) - : : +- ^ ShuffledHashJoinExecTransformer Inner (77) - : : :- ^ InputIteratorTransformer (68) - : : : +- ^ InputAdapter (67) - : : : +- ^ ShuffleQueryStage (66) - : : : +- ColumnarExchange (65) - : : : +- ^ ProjectExecTransformer (63) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : : : :- ^ InputIteratorTransformer (53) - 
: : : : +- ^ InputAdapter (52) - : : : : +- ^ ShuffleQueryStage (51) - : : : : +- ColumnarExchange (50) - : : : : +- ^ ProjectExecTransformer (48) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : : : :- ^ InputIteratorTransformer (38) - : : : : : +- ^ InputAdapter (37) - : : : : : +- ^ ShuffleQueryStage (36) - : : : : : +- ColumnarExchange (35) - : : : : : +- ^ ProjectExecTransformer (33) - : : : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : : : :- ^ InputIteratorTransformer (23) - : : : : : : +- ^ InputAdapter (22) - : : : : : : +- ^ ShuffleQueryStage (21) - : : : : : : +- ColumnarExchange (20) - : : : : : : +- ^ ProjectExecTransformer (18) - : : : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : : : :- ^ InputIteratorTransformer (8) - : : : : : : : +- ^ InputAdapter (7) - : : : : : : : +- ^ ShuffleQueryStage (6) - : : : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (141) + +- ^ SortExecTransformer (139) + +- ^ InputIteratorTransformer (138) + +- ShuffleQueryStage (136) + +- ColumnarExchange (135) + +- VeloxAppendBatches (134) + +- ^ ProjectExecTransformer (132) + +- ^ RegularHashAggregateExecTransformer (131) + +- ^ InputIteratorTransformer (130) + +- ShuffleQueryStage (128) + +- ColumnarExchange (127) + +- VeloxAppendBatches (126) + +- ^ ProjectExecTransformer (124) + +- ^ FlushableHashAggregateExecTransformer (123) + +- ^ ProjectExecTransformer (122) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (121) + :- ^ InputIteratorTransformer (111) + : +- ShuffleQueryStage (109) + : +- ColumnarExchange (108) + : +- VeloxAppendBatches (107) + : +- ^ ProjectExecTransformer (105) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (104) + : :- ^ InputIteratorTransformer (94) + : : +- ShuffleQueryStage (92) + : : +- ColumnarExchange (91) + : : +- VeloxAppendBatches (90) + : : +- ^ ProjectExecTransformer (88) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + : : :- ^ InputIteratorTransformer (77) + : : : +- ShuffleQueryStage (75) + : : : +- ColumnarExchange (74) + : : : +- VeloxAppendBatches (73) + : : : +- ^ ProjectExecTransformer (71) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (70) + : : : :- ^ InputIteratorTransformer (60) + : : : : +- ShuffleQueryStage (58) + : : : : +- ColumnarExchange (57) + : : : : +- VeloxAppendBatches (56) + : : : : +- ^ ProjectExecTransformer (54) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (53) + : : : : :- ^ InputIteratorTransformer (43) + : : : : : +- ShuffleQueryStage (41) + : : : : : +- ColumnarExchange (40) + : : : : : +- VeloxAppendBatches (39) + : : : : : +- ^ ProjectExecTransformer (37) + : : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : : : :- ^ InputIteratorTransformer (26) + : : : : : : +- ShuffleQueryStage (24) + : : : : : : +- ColumnarExchange (23) + : : : : : : +- VeloxAppendBatches (22) + : : : : : : +- ^ ProjectExecTransformer (20) + : : : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : : : :- ^ InputIteratorTransformer (9) + : : : : : : : +- ShuffleQueryStage (7) + : : : : : : : +- ColumnarExchange (6) + : : : : : : : +- VeloxAppendBatches (5) : : : : : : : +- ^ ProjectExecTransformer (3) : : : : : : : +- ^ FilterExecTransformer (2) : : : : : : : +- ^ Scan parquet (1) - : : : : : : +- ^ InputIteratorTransformer (16) - : : : : : : +- ^ InputAdapter (15) - : : : : : : +- ^ ShuffleQueryStage (14) - : : : : : : +- ColumnarExchange (13) - : : : : : : +- ^ ProjectExecTransformer (11) - : 
: : : : : +- ^ FilterExecTransformer (10) - : : : : : : +- ^ Scan parquet (9) - : : : : : +- ^ InputIteratorTransformer (31) - : : : : : +- ^ InputAdapter (30) - : : : : : +- ^ ShuffleQueryStage (29) - : : : : : +- ColumnarExchange (28) - : : : : : +- ^ ProjectExecTransformer (26) - : : : : : +- ^ FilterExecTransformer (25) - : : : : : +- ^ Scan parquet (24) - : : : : +- ^ InputIteratorTransformer (46) - : : : : +- ^ InputAdapter (45) - : : : : +- ^ ShuffleQueryStage (44) - : : : : +- ColumnarExchange (43) - : : : : +- ^ ProjectExecTransformer (41) - : : : : +- ^ FilterExecTransformer (40) - : : : : +- ^ Scan parquet (39) - : : : +- ^ InputIteratorTransformer (61) - : : : +- ^ InputAdapter (60) - : : : +- ^ ShuffleQueryStage (59) - : : : +- ColumnarExchange (58) - : : : +- ^ ProjectExecTransformer (56) - : : : +- ^ FilterExecTransformer (55) - : : : +- ^ Scan parquet (54) - : : +- ^ InputIteratorTransformer (76) - : : +- ^ InputAdapter (75) - : : +- ^ ShuffleQueryStage (74) - : : +- ColumnarExchange (73) - : : +- ^ ProjectExecTransformer (71) - : : +- ^ FilterExecTransformer (70) - : : +- ^ Scan parquet (69) - : +- ^ InputIteratorTransformer (91) - : +- ^ InputAdapter (90) - : +- ^ ShuffleQueryStage (89) - : +- ColumnarExchange (88) - : +- ^ ProjectExecTransformer (86) - : +- ^ FilterExecTransformer (85) - : +- ^ Scan parquet (84) - +- ^ InputIteratorTransformer (106) - +- ^ InputAdapter (105) - +- ^ ShuffleQueryStage (104) - +- ColumnarExchange (103) - +- ^ ProjectExecTransformer (101) - +- ^ FilterExecTransformer (100) - +- ^ Scan parquet (99) + : : : : : : +- ^ InputIteratorTransformer (18) + : : : : : : +- ShuffleQueryStage (16) + : : : : : : +- ColumnarExchange (15) + : : : : : : +- VeloxAppendBatches (14) + : : : : : : +- ^ ProjectExecTransformer (12) + : : : : : : +- ^ FilterExecTransformer (11) + : : : : : : +- ^ Scan parquet (10) + : : : : : +- ^ InputIteratorTransformer (35) + : : : : : +- ShuffleQueryStage (33) + : : : : : +- ColumnarExchange (32) + : : : : : +- VeloxAppendBatches (31) + : : : : : +- ^ ProjectExecTransformer (29) + : : : : : +- ^ FilterExecTransformer (28) + : : : : : +- ^ Scan parquet (27) + : : : : +- ^ InputIteratorTransformer (52) + : : : : +- ShuffleQueryStage (50) + : : : : +- ColumnarExchange (49) + : : : : +- VeloxAppendBatches (48) + : : : : +- ^ ProjectExecTransformer (46) + : : : : +- ^ FilterExecTransformer (45) + : : : : +- ^ Scan parquet (44) + : : : +- ^ InputIteratorTransformer (69) + : : : +- ShuffleQueryStage (67) + : : : +- ColumnarExchange (66) + : : : +- VeloxAppendBatches (65) + : : : +- ^ ProjectExecTransformer (63) + : : : +- ^ FilterExecTransformer (62) + : : : +- ^ Scan parquet (61) + : : +- ^ InputIteratorTransformer (86) + : : +- ShuffleQueryStage (84) + : : +- ColumnarExchange (83) + : : +- VeloxAppendBatches (82) + : : +- ^ ProjectExecTransformer (80) + : : +- ^ FilterExecTransformer (79) + : : +- ^ Scan parquet (78) + : +- ^ InputIteratorTransformer (103) + : +- ShuffleQueryStage (101) + : +- ColumnarExchange (100) + : +- VeloxAppendBatches (99) + : +- ^ ProjectExecTransformer (97) + : +- ^ FilterExecTransformer (96) + : +- ^ Scan parquet (95) + +- ^ InputIteratorTransformer (120) + +- ShuffleQueryStage (118) + +- ColumnarExchange (117) + +- VeloxAppendBatches (116) + +- ^ ProjectExecTransformer (114) + +- ^ FilterExecTransformer (113) + +- ^ Scan parquet (112) +- == Initial Plan == - Sort (176) - +- Exchange (175) - +- HashAggregate (174) - +- Exchange (173) - +- HashAggregate (172) - +- Project (171) - +- ShuffledHashJoin Inner 
BuildRight (170) - :- Exchange (165) - : +- Project (164) - : +- ShuffledHashJoin Inner BuildRight (163) - : :- Exchange (159) - : : +- Project (158) - : : +- ShuffledHashJoin Inner BuildRight (157) - : : :- Exchange (153) - : : : +- Project (152) - : : : +- ShuffledHashJoin Inner BuildRight (151) - : : : :- Exchange (147) - : : : : +- Project (146) - : : : : +- ShuffledHashJoin Inner BuildRight (145) - : : : : :- Exchange (141) - : : : : : +- Project (140) - : : : : : +- ShuffledHashJoin Inner BuildRight (139) - : : : : : :- Exchange (135) - : : : : : : +- Project (134) - : : : : : : +- ShuffledHashJoin Inner BuildLeft (133) - : : : : : : :- Exchange (129) - : : : : : : : +- Project (128) - : : : : : : : +- Filter (127) - : : : : : : : +- Scan parquet (126) - : : : : : : +- Exchange (132) - : : : : : : +- Filter (131) - : : : : : : +- Scan parquet (130) - : : : : : +- Exchange (138) - : : : : : +- Filter (137) - : : : : : +- Scan parquet (136) - : : : : +- Exchange (144) - : : : : +- Filter (143) - : : : : +- Scan parquet (142) - : : : +- Exchange (150) - : : : +- Filter (149) - : : : +- Scan parquet (148) - : : +- Exchange (156) - : : +- Filter (155) - : : +- Scan parquet (154) - : +- Exchange (162) - : +- Filter (161) - : +- Scan parquet (160) - +- Exchange (169) - +- Project (168) - +- Filter (167) - +- Scan parquet (166) + Sort (206) + +- Exchange (205) + +- HashAggregate (204) + +- Exchange (203) + +- HashAggregate (202) + +- Project (201) + +- SortMergeJoin Inner (200) + :- Sort (194) + : +- Exchange (193) + : +- Project (192) + : +- SortMergeJoin Inner (191) + : :- Sort (186) + : : +- Exchange (185) + : : +- Project (184) + : : +- SortMergeJoin Inner (183) + : : :- Sort (178) + : : : +- Exchange (177) + : : : +- Project (176) + : : : +- SortMergeJoin Inner (175) + : : : :- Sort (170) + : : : : +- Exchange (169) + : : : : +- Project (168) + : : : : +- SortMergeJoin Inner (167) + : : : : :- Sort (162) + : : : : : +- Exchange (161) + : : : : : +- Project (160) + : : : : : +- SortMergeJoin Inner (159) + : : : : : :- Sort (154) + : : : : : : +- Exchange (153) + : : : : : : +- Project (152) + : : : : : : +- SortMergeJoin Inner (151) + : : : : : : :- Sort (146) + : : : : : : : +- Exchange (145) + : : : : : : : +- Project (144) + : : : : : : : +- Filter (143) + : : : : : : : +- Scan parquet (142) + : : : : : : +- Sort (150) + : : : : : : +- Exchange (149) + : : : : : : +- Filter (148) + : : : : : : +- Scan parquet (147) + : : : : : +- Sort (158) + : : : : : +- Exchange (157) + : : : : : +- Filter (156) + : : : : : +- Scan parquet (155) + : : : : +- Sort (166) + : : : : +- Exchange (165) + : : : : +- Filter (164) + : : : : +- Scan parquet (163) + : : : +- Sort (174) + : : : +- Exchange (173) + : : : +- Filter (172) + : : : +- Scan parquet (171) + : : +- Sort (182) + : : +- Exchange (181) + : : +- Filter (180) + : : +- Scan parquet (179) + : +- Sort (190) + : +- Exchange (189) + : +- Filter (188) + : +- Scan parquet (187) + +- Sort (199) + +- Exchange (198) + +- Project (197) + +- Filter (196) + +- Scan parquet (195) (1) Scan parquet @@ -182,732 +196,852 @@ Input [2]: [p_partkey#X, p_type#X] Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: 
[p_partkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [p_partkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [p_partkey#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [6]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(23) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true 
Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [5]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(40) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(39) Scan parquet +(44) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(42) WholeStageCodegenTransformer (X) +(47) 
WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [6]: [hash(o_custkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] +Arguments: X + +(57) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(55) FilterExecTransformer +(62) FilterExecTransformer Input [2]: [c_custkey#X, c_nationkey#X] Arguments: (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(c_custkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_nationkey#X] Input [2]: [c_custkey#X, c_nationkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(60) InputAdapter 
+(68) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [6]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] +Arguments: X + +(74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(70) FilterExecTransformer +(79) FilterExecTransformer Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_regionkey#X] Input [2]: [n_nationkey#X, n_regionkey#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] +Arguments: X + +(83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_regionkey#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [2]: [n_nationkey#X, n_regionkey#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [2]: [n_nationkey#X, n_regionkey#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [6]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, 
n_nationkey#X, n_regionkey#X] -(79) WholeStageCodegenTransformer (X) +(89) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: false -(80) ColumnarExchange +(90) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] +Arguments: X + +(91) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [id=#X] -(81) ShuffleQueryStage +(92) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: X -(82) InputAdapter +(93) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -(83) InputIteratorTransformer +(94) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -(84) Scan parquet +(95) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(85) FilterExecTransformer +(96) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: isnotnull(n_nationkey#X) -(86) ProjectExecTransformer +(97) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(100) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(92) ShuffledHashJoinExecTransformer +(104) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(93) ProjectExecTransformer +(105) ProjectExecTransformer Output [6]: [hash(n_regionkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(94) WholeStageCodegenTransformer (X) +(106) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: false -(95) ColumnarExchange +(107) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] +Arguments: X + +(108) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [id=#X] -(96) 
ShuffleQueryStage +(109) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: X -(97) InputAdapter +(110) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -(98) InputIteratorTransformer +(111) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -(99) Scan parquet +(112) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(100) FilterExecTransformer +(113) FilterExecTransformer Input [2]: [r_regionkey#X, r_name#X] Arguments: ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(101) ProjectExecTransformer +(114) ProjectExecTransformer Output [2]: [hash(r_regionkey#X, 42) AS hash_partition_key#X, r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(102) WholeStageCodegenTransformer (X) +(115) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: false -(103) ColumnarExchange +(116) VeloxAppendBatches +Input [2]: [hash_partition_key#X, r_regionkey#X] +Arguments: X + +(117) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] -(104) ShuffleQueryStage +(118) ShuffleQueryStage Output [1]: [r_regionkey#X] Arguments: X -(105) InputAdapter +(119) InputAdapter Input [1]: [r_regionkey#X] -(106) InputIteratorTransformer +(120) InputIteratorTransformer Input [1]: [r_regionkey#X] -(107) ShuffledHashJoinExecTransformer +(121) ShuffledHashJoinExecTransformer Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(108) ProjectExecTransformer +(122) ProjectExecTransformer Output [4]: [year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS volume#X, n_name#X AS nation#X, CASE WHEN (n_name#X = BRAZIL) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) ELSE 0.0000 END AS _pre_X#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(109) FlushableHashAggregateExecTransformer +(123) FlushableHashAggregateExecTransformer Input [4]: [o_year#X, volume#X, nation#X, _pre_X#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(_pre_X#X), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(110) ProjectExecTransformer +(124) ProjectExecTransformer Output [6]: [hash(o_year#X, 42) AS hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(111) WholeStageCodegenTransformer (X) +(125) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(112) ColumnarExchange +(126) VeloxAppendBatches +Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(127) ColumnarExchange Input [6]: 
[hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(113) ShuffleQueryStage +(128) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(114) InputAdapter +(129) InputAdapter Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(115) InputIteratorTransformer +(130) InputIteratorTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(116) RegularHashAggregateExecTransformer +(131) RegularHashAggregateExecTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(117) ProjectExecTransformer +(132) ProjectExecTransformer Output [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6), true) AS mkt_share#X] Input [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(118) WholeStageCodegenTransformer (X) +(133) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(119) ColumnarExchange +(134) VeloxAppendBatches +Input [2]: [o_year#X, mkt_share#X] +Arguments: X + +(135) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(120) ShuffleQueryStage +(136) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] Arguments: X -(121) InputAdapter +(137) InputAdapter Input [2]: [o_year#X, mkt_share#X] -(122) InputIteratorTransformer +(138) InputIteratorTransformer Input [2]: [o_year#X, mkt_share#X] -(123) SortExecTransformer +(139) SortExecTransformer Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(124) WholeStageCodegenTransformer (X) +(140) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(125) VeloxColumnarToRowExec +(141) VeloxColumnarToRowExec Input [2]: [o_year#X, mkt_share#X] -(126) Scan parquet +(142) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_type), EqualTo(p_type,ECONOMY ANODIZED STEEL), IsNotNull(p_partkey)] ReadSchema: struct -(127) Filter +(143) Filter Input [2]: [p_partkey#X, p_type#X] Condition : ((isnotnull(p_type#X) AND (p_type#X = ECONOMY ANODIZED STEEL)) AND isnotnull(p_partkey#X)) -(128) Project +(144) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_type#X] -(129) Exchange +(145) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(130) Scan parquet +(146) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(147) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(131) Filter +(148) Filter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : 
((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(132) Exchange +(149) Exchange Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(133) ShuffledHashJoin +(150) Sort +Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(151) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(134) Project +(152) Project Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(135) Exchange +(153) Exchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(136) Scan parquet +(154) Sort +Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(155) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(137) Filter +(156) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(138) Exchange +(157) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(139) ShuffledHashJoin +(158) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(159) SortMergeJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(140) Project +(160) Project Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(141) Exchange +(161) Exchange Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(142) Scan parquet +(162) Sort +Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(163) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(143) Filter +(164) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(144) Exchange +(165) Exchange Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(145) ShuffledHashJoin +(166) Sort +Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(167) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(146) Project +(168) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, 
o_orderdate#X] -(147) Exchange +(169) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(148) Scan parquet +(170) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(171) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(149) Filter +(172) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(150) Exchange +(173) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(151) ShuffledHashJoin +(174) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(175) SortMergeJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(152) Project +(176) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(153) Exchange +(177) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(154) Scan parquet +(178) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(179) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(155) Filter +(180) Filter Input [2]: [n_nationkey#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(156) Exchange +(181) Exchange Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(157) ShuffledHashJoin +(182) Sort +Input [2]: [n_nationkey#X, n_regionkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(183) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(158) Project +(184) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(159) Exchange +(185) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(160) Scan parquet +(186) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(187) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(161) Filter +(188) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(162) Exchange +(189) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(163) ShuffledHashJoin +(190) Sort +Input [2]: [n_nationkey#X, n_name#X] 
+Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(191) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(164) Project +(192) Project Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(165) Exchange +(193) Exchange Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(166) Scan parquet +(194) Sort +Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] +Arguments: [n_regionkey#X ASC NULLS FIRST], false, 0 + +(195) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(167) Filter +(196) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(168) Project +(197) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(169) Exchange +(198) Exchange Input [1]: [r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(170) ShuffledHashJoin +(199) Sort +Input [1]: [r_regionkey#X] +Arguments: [r_regionkey#X ASC NULLS FIRST], false, 0 + +(200) SortMergeJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(171) Project +(201) Project Output [3]: [year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) AS volume#X, n_name#X AS nation#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(172) HashAggregate +(202) HashAggregate Input [3]: [o_year#X, volume#X, nation#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(173) Exchange +(203) Exchange Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(174) HashAggregate +(204) HashAggregate Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6), true) AS mkt_share#X] -(175) Exchange +(205) Exchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(176) Sort +(206) Sort Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(177) AdaptiveSparkPlan +(207) AdaptiveSparkPlan Output [2]: [o_year#X, mkt_share#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/9.txt 
b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/9.txt index a811b042b603c..cb207c0800c3a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/9.txt @@ -1,126 +1,136 @@ == Physical Plan == -AdaptiveSparkPlan (133) +AdaptiveSparkPlan (155) +- == Final Plan == - VeloxColumnarToRowExec (94) - +- ^ SortExecTransformer (92) - +- ^ InputIteratorTransformer (91) - +- ^ InputAdapter (90) - +- ^ ShuffleQueryStage (89) - +- ColumnarExchange (88) - +- ^ RegularHashAggregateExecTransformer (86) - +- ^ InputIteratorTransformer (85) - +- ^ InputAdapter (84) - +- ^ ShuffleQueryStage (83) - +- ColumnarExchange (82) - +- ^ ProjectExecTransformer (80) - +- ^ FlushableHashAggregateExecTransformer (79) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (106) + +- ^ SortExecTransformer (104) + +- ^ InputIteratorTransformer (103) + +- ShuffleQueryStage (101) + +- ColumnarExchange (100) + +- VeloxAppendBatches (99) + +- ^ RegularHashAggregateExecTransformer (97) + +- ^ InputIteratorTransformer (96) + +- ShuffleQueryStage (94) + +- ColumnarExchange (93) + +- VeloxAppendBatches (92) + +- ^ ProjectExecTransformer (90) + +- ^ FlushableHashAggregateExecTransformer (89) + +- ^ ProjectExecTransformer (88) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- 
ShuffleQueryStage (7) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ FilterExecTransformer (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ FilterExecTransformer (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ FilterExecTransformer (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ FilterExecTransformer (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ FilterExecTransformer (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ FilterExecTransformer (70) - +- ^ Scan parquet (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ FilterExecTransformer (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ FilterExecTransformer (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ FilterExecTransformer (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ FilterExecTransformer (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (86) + +- ShuffleQueryStage (84) + +- ColumnarExchange (83) + +- VeloxAppendBatches (82) + +- ^ ProjectExecTransformer (80) + +- ^ FilterExecTransformer (79) + +- ^ Scan parquet (78) +- == Initial Plan == - Sort (132) - +- Exchange (131) - +- HashAggregate (130) - +- Exchange (129) - +- HashAggregate (128) - +- Project (127) - +- ShuffledHashJoin Inner BuildRight (126) - :- Exchange (122) - : +- Project (121) - : +- ShuffledHashJoin Inner BuildRight (120) - : :- Exchange (116) - : : +- Project (115) - : : +- ShuffledHashJoin Inner BuildRight (114) - : : :- Exchange (110) - : : : +- Project (109) - : : : +- ShuffledHashJoin Inner BuildRight (108) - : : : :- Exchange (104) - : : : : +- Project (103) - : : : : +- ShuffledHashJoin Inner BuildLeft (102) - : : : : :- Exchange (98) - : : : : : +- Project (97) - : : : : : +- Filter (96) - : : : : : +- Scan parquet (95) - : : : : +- Exchange (101) - : : : : +- Filter (100) - : : : : +- Scan parquet (99) - : : : +- Exchange (107) - : : : +- Filter (106) - : : : +- Scan parquet (105) - : : +- Exchange (113) - : : +- Filter (112) - : : +- Scan parquet 
(111) - : +- Exchange (119) - : +- Filter (118) - : +- Scan parquet (117) - +- Exchange (125) - +- Filter (124) - +- Scan parquet (123) + Sort (154) + +- Exchange (153) + +- HashAggregate (152) + +- Exchange (151) + +- HashAggregate (150) + +- Project (149) + +- SortMergeJoin Inner (148) + :- Sort (143) + : +- Exchange (142) + : +- Project (141) + : +- SortMergeJoin Inner (140) + : :- Sort (135) + : : +- Exchange (134) + : : +- Project (133) + : : +- SortMergeJoin Inner (132) + : : :- Sort (127) + : : : +- Exchange (126) + : : : +- Project (125) + : : : +- SortMergeJoin Inner (124) + : : : :- Sort (119) + : : : : +- Exchange (118) + : : : : +- Project (117) + : : : : +- SortMergeJoin Inner (116) + : : : : :- Sort (111) + : : : : : +- Exchange (110) + : : : : : +- Project (109) + : : : : : +- Filter (108) + : : : : : +- Scan parquet (107) + : : : : +- Sort (115) + : : : : +- Exchange (114) + : : : : +- Filter (113) + : : : : +- Scan parquet (112) + : : : +- Sort (123) + : : : +- Exchange (122) + : : : +- Filter (121) + : : : +- Scan parquet (120) + : : +- Sort (131) + : : +- Exchange (130) + : : +- Filter (129) + : : +- Scan parquet (128) + : +- Sort (139) + : +- Exchange (138) + : +- Filter (137) + : +- Scan parquet (136) + +- Sort (147) + +- Exchange (146) + +- Filter (145) + +- Scan parquet (144) (1) Scan parquet @@ -142,548 +152,636 @@ Input [2]: [p_partkey#X, p_name#X] Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [p_partkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [p_partkey#X] -(9) Scan parquet +(10) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [7]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(15) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage 
+(16) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [7]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(23) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join 
condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [8]: [hash(l_suppkey#X, l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(40) ColumnarExchange Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(39) Scan parquet +(44) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(ps_suppkey#X, ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [7]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_quantity#X, 
l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] +Arguments: X + +(57) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(55) FilterExecTransformer +(62) FilterExecTransformer Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: isnotnull(o_orderkey#X) -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_orderdate#X] Input [2]: [o_orderkey#X, o_orderdate#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderdate#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [o_orderkey#X, o_orderdate#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderdate#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [7]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [7]: [hash_partition_key#X, 
l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] +Arguments: X + +(74) ColumnarExchange Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(70) FilterExecTransformer +(79) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: isnotnull(n_nationkey#X) -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) as decimal(27,4))) - promote_precision(cast(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(l_quantity#X)), DecimalType(25,4), true) as decimal(27,4)))), DecimalType(27,4), true) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(79) FlushableHashAggregateExecTransformer +(89) FlushableHashAggregateExecTransformer Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(80) ProjectExecTransformer +(90) ProjectExecTransformer Output [5]: [hash(nation#X, o_year#X, 42) AS hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Input [4]: [nation#X, o_year#X, 
sum#X, isEmpty#X] -(81) WholeStageCodegenTransformer (X) +(91) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: false -(82) ColumnarExchange +(92) VeloxAppendBatches +Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] +Arguments: X + +(93) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(83) ShuffleQueryStage +(94) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: X -(84) InputAdapter +(95) InputAdapter Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(85) InputIteratorTransformer +(96) InputIteratorTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(86) RegularHashAggregateExecTransformer +(97) RegularHashAggregateExecTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [3]: [nation#X, o_year#X, sum_profit#X] +Arguments: X + +(100) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [3]: [nation#X, o_year#X, sum_profit#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] -(92) SortExecTransformer +(104) SortExecTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(93) WholeStageCodegenTransformer (X) +(105) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(94) VeloxColumnarToRowExec +(106) VeloxColumnarToRowExec Input [3]: [nation#X, o_year#X, sum_profit#X] -(95) Scan parquet +(107) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringContains(p_name,green), IsNotNull(p_partkey)] ReadSchema: struct -(96) Filter +(108) Filter Input [2]: [p_partkey#X, p_name#X] Condition : ((isnotnull(p_name#X) AND Contains(p_name#X, green)) AND isnotnull(p_partkey#X)) -(97) Project +(109) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(98) Exchange +(110) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Scan parquet +(111) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(112) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(100) Filter +(113) Filter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(101) 
Exchange +(114) Exchange Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) ShuffledHashJoin +(115) Sort +Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(116) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(103) Project +(117) Project Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(104) Exchange +(118) Exchange Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) Scan parquet +(119) Sort +Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(120) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(106) Filter +(121) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(107) Exchange +(122) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) ShuffledHashJoin +(123) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(124) SortMergeJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(109) Project +(125) Project Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(110) Exchange +(126) Exchange Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(111) Scan parquet +(127) Sort +Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [l_suppkey#X ASC NULLS FIRST, l_partkey#X ASC NULLS FIRST], false, 0 + +(128) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(112) Filter +(129) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Condition : (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(113) Exchange +(130) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(114) ShuffledHashJoin +(131) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST, ps_partkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join condition: None -(115) Project +(133) Project Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] 
Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(116) Exchange +(134) Exchange Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) Scan parquet +(135) Sort +Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(136) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(118) Filter +(137) Filter Input [2]: [o_orderkey#X, o_orderdate#X] Condition : isnotnull(o_orderkey#X) -(119) Exchange +(138) Exchange Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(120) ShuffledHashJoin +(139) Sort +Input [2]: [o_orderkey#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(140) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(121) Project +(141) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(122) Exchange +(142) Exchange Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(123) Scan parquet +(143) Sort +Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(144) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(124) Filter +(145) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(125) Exchange +(146) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(126) ShuffledHashJoin +(147) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(148) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(127) Project +(149) Project Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2), true))), DecimalType(26,4), true) as decimal(27,4))) - promote_precision(cast(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(l_quantity#X)), DecimalType(25,4), true) as decimal(27,4)))), DecimalType(27,4), true) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(128) HashAggregate +(150) HashAggregate Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(129) Exchange +(151) Exchange 
Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(130) HashAggregate +(152) HashAggregate Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(131) Exchange +(153) Exchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(132) Sort +(154) Sort Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(133) AdaptiveSparkPlan +(155) AdaptiveSparkPlan Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/1.txt index 0cacd70fdca74..22dd5100c4fb3 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/1.txt @@ -1,31 +1,31 @@ == Physical Plan == -AdaptiveSparkPlan (28) +AdaptiveSparkPlan (30) +- == Final Plan == - VeloxColumnarToRowExec (19) - +- ^ SortExecTransformer (17) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8), Statistics(X) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (21) + +- ^ SortExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9), Statistics(X) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ ProjectExecTransformer (5) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ FilterExecTransformer (2) +- ^ Scan parquet (1) +- == Initial Plan == - Sort (27) - +- Exchange (26) - +- HashAggregate (25) - +- Exchange (24) - +- HashAggregate (23) - +- Project (22) - +- Filter (21) - +- Scan parquet (20) + Sort (29) + +- Exchange (28) + +- HashAggregate (27) + +- Exchange (26) + +- HashAggregate (25) + +- Project (24) + +- Filter (23) + +- Scan parquet (22) (1) Scan parquet @@ -58,97 +58,105 @@ Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] +Arguments: X + +(8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, 
isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, 
avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] +Arguments: X + +(15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(17) SortExecTransformer +(19) SortExecTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(18) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(19) VeloxColumnarToRowExec +(21) VeloxColumnarToRowExec Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(20) Scan parquet +(22) Scan parquet Output [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), LessThanOrEqual(l_shipdate,1998-09-02)] ReadSchema: struct -(21) Filter +(23) Filter Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Condition : (isnotnull(l_shipdate#X) AND (l_shipdate#X <= 1998-09-02)) -(22) Project +(24) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] -(23) HashAggregate +(25) HashAggregate Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [partial_sum(l_quantity#X), partial_sum(l_extendedprice#X), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))), 
partial_sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6))), partial_avg(l_quantity#X), partial_avg(l_extendedprice#X), partial_avg(l_discount#X), partial_count(1)] Aggregate Attributes [15]: [sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Results [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(24) Exchange +(26) Exchange Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(25) HashAggregate +(27) HashAggregate Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))), sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS sum_disc_price#X, sum(CheckOverflow((promote_precision(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), 
DecimalType(13,2)))), DecimalType(26,4))) * promote_precision(cast(CheckOverflow((1.00 + promote_precision(cast(l_tax#X as decimal(13,2)))), DecimalType(13,2)) as decimal(26,4)))), DecimalType(38,6)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(26) Exchange +(28) Exchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Sort +(29) Sort Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(28) AdaptiveSparkPlan +(30) AdaptiveSparkPlan Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/10.txt index ae8759a6b86ce..c295515b8a6cb 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/10.txt @@ -1,85 +1,91 @@ == Physical Plan == -AdaptiveSparkPlan (87) +AdaptiveSparkPlan (100) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- TakeOrderedAndProjectExecTransformer (59) - +- ^ ProjectExecTransformer (57) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) - +- ^ ProjectExecTransformer (50) - +- ^ FlushableHashAggregateExecTransformer (49) - +- ^ ProjectExecTransformer (48) - +- ^ ShuffledHashJoinExecTransformer Inner (47) - :- ^ InputIteratorTransformer (38) - : +- ^ InputAdapter (37) - : +- ^ ShuffleQueryStage (36), Statistics(X) - : +- ColumnarExchange (35) - : +- ^ ProjectExecTransformer (33) - : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : :- ^ InputIteratorTransformer (23) - : : +- ^ InputAdapter (22) - : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : +- ColumnarExchange (20) - : : +- ^ ProjectExecTransformer (18) - : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : :- ^ InputIteratorTransformer (8) - : : : +- ^ InputAdapter (7) - : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (67) + +- TakeOrderedAndProjectExecTransformer (66) + +- ^ ProjectExecTransformer (64) + +- ^ RegularHashAggregateExecTransformer (63) + +- ^ InputIteratorTransformer (62) + +- ShuffleQueryStage (60), Statistics(X) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ ProjectExecTransformer (56) + +- ^ FlushableHashAggregateExecTransformer (55) + +- ^ ProjectExecTransformer (54) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + :- ^ InputIteratorTransformer (43) + : +- ShuffleQueryStage (41), Statistics(X) + : +- ColumnarExchange (40) + : +- VeloxAppendBatches (39) + : +- ^ ProjectExecTransformer (37) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + : :- ^ InputIteratorTransformer (26) + : : +- ShuffleQueryStage (24), Statistics(X) + 
: : +- ColumnarExchange (23) + : : +- VeloxAppendBatches (22) + : : +- ^ ProjectExecTransformer (20) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : : :- ^ InputIteratorTransformer (9) + : : : +- ShuffleQueryStage (7), Statistics(X) + : : : +- ColumnarExchange (6) + : : : +- VeloxAppendBatches (5) : : : +- ^ ProjectExecTransformer (3) : : : +- ^ FilterExecTransformer (2) : : : +- ^ Scan parquet (1) - : : +- ^ InputIteratorTransformer (16) - : : +- ^ InputAdapter (15) - : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : +- ColumnarExchange (13) - : : +- ^ ProjectExecTransformer (11) - : : +- ^ FilterExecTransformer (10) - : : +- ^ Scan parquet (9) - : +- ^ InputIteratorTransformer (31) - : +- ^ InputAdapter (30) - : +- ^ ShuffleQueryStage (29), Statistics(X) - : +- ColumnarExchange (28) - : +- ^ ProjectExecTransformer (26) - : +- ^ FilterExecTransformer (25) - : +- ^ Scan parquet (24) - +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ ShuffleQueryStage (44), Statistics(X) - +- ColumnarExchange (43) - +- ^ ProjectExecTransformer (41) - +- ^ FilterExecTransformer (40) - +- ^ Scan parquet (39) + : : +- ^ InputIteratorTransformer (18) + : : +- ShuffleQueryStage (16), Statistics(X) + : : +- ColumnarExchange (15) + : : +- VeloxAppendBatches (14) + : : +- ^ ProjectExecTransformer (12) + : : +- ^ FilterExecTransformer (11) + : : +- ^ Scan parquet (10) + : +- ^ InputIteratorTransformer (35) + : +- ShuffleQueryStage (33), Statistics(X) + : +- ColumnarExchange (32) + : +- VeloxAppendBatches (31) + : +- ^ ProjectExecTransformer (29) + : +- ^ FilterExecTransformer (28) + : +- ^ Scan parquet (27) + +- ^ InputIteratorTransformer (52) + +- ShuffleQueryStage (50), Statistics(X) + +- ColumnarExchange (49) + +- VeloxAppendBatches (48) + +- ^ ProjectExecTransformer (46) + +- ^ FilterExecTransformer (45) + +- ^ Scan parquet (44) +- == Initial Plan == - TakeOrderedAndProject (86) - +- HashAggregate (85) - +- Exchange (84) - +- HashAggregate (83) - +- Project (82) - +- ShuffledHashJoin Inner BuildRight (81) - :- Exchange (77) - : +- Project (76) - : +- ShuffledHashJoin Inner BuildRight (75) - : :- Exchange (70) - : : +- Project (69) - : : +- ShuffledHashJoin Inner BuildRight (68) - : : :- Exchange (63) - : : : +- Filter (62) - : : : +- Scan parquet (61) - : : +- Exchange (67) - : : +- Project (66) - : : +- Filter (65) - : : +- Scan parquet (64) - : +- Exchange (74) - : +- Project (73) - : +- Filter (72) - : +- Scan parquet (71) - +- Exchange (80) - +- Filter (79) - +- Scan parquet (78) + TakeOrderedAndProject (99) + +- HashAggregate (98) + +- Exchange (97) + +- HashAggregate (96) + +- Project (95) + +- SortMergeJoin Inner (94) + :- Sort (89) + : +- Exchange (88) + : +- Project (87) + : +- SortMergeJoin Inner (86) + : :- Sort (80) + : : +- Exchange (79) + : : +- Project (78) + : : +- SortMergeJoin Inner (77) + : : :- Sort (71) + : : : +- Exchange (70) + : : : +- Filter (69) + : : : +- Scan parquet (68) + : : +- Sort (76) + : : +- Exchange (75) + : : +- Project (74) + : : +- Filter (73) + : : +- Scan parquet (72) + : +- Sort (85) + : +- Exchange (84) + : +- Project (83) + : +- Filter (82) + : +- Scan parquet (81) + +- Sort (93) + +- Exchange (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet @@ -101,358 +107,410 @@ Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acct Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: false -(5) ColumnarExchange +(5) 
VeloxAppendBatches +Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] +Arguments: X + +(6) ColumnarExchange Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [9]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] +Arguments: X + +(23) ColumnarExchange Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, 
c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Arguments: ((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [10]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(40) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, 
l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: isnotnull(n_nationkey#X) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [10]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(49) FlushableHashAggregateExecTransformer +(55) FlushableHashAggregateExecTransformer Input [10]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X, _pre_X#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(50) ProjectExecTransformer +(56) ProjectExecTransformer Output [10]: [hash(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, 
c_address#X, c_comment#X, sum#X, isEmpty#X] Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(51) WholeStageCodegenTransformer (X) +(57) WholeStageCodegenTransformer (X) Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: false -(52) ColumnarExchange +(58) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] +Arguments: X + +(59) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(60) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: X -(54) InputAdapter +(61) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(55) InputIteratorTransformer +(62) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(56) RegularHashAggregateExecTransformer +(63) RegularHashAggregateExecTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(57) ProjectExecTransformer +(64) ProjectExecTransformer Output [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Input [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(58) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: false -(59) 
TakeOrderedAndProjectExecTransformer +(66) TakeOrderedAndProjectExecTransformer Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X], 0 -(60) VeloxColumnarToRowExec +(67) VeloxColumnarToRowExec Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(61) Scan parquet +(68) Scan parquet Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(62) Filter +(69) Filter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(63) Exchange +(70) Exchange Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) Scan parquet +(71) Sort +Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(72) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(65) Filter +(73) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(66) Project +(74) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(67) Exchange +(75) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) ShuffledHashJoin +(76) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(77) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(69) Project +(78) Project Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(70) Exchange +(79) Exchange Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(80) Sort +Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(81) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(72) Filter +(82) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Condition : ((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(73) Project +(83) Project 
Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(74) Exchange +(84) Exchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(75) ShuffledHashJoin +(85) Sort +Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(86) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(76) Project +(87) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(77) Exchange +(88) Exchange Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(78) Scan parquet +(89) Sort +Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(79) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(80) Exchange +(92) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(81) ShuffledHashJoin +(93) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(94) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(82) Project +(95) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(83) HashAggregate +(96) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(84) Exchange +(97) Exchange Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) HashAggregate +(98) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: 
[sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [8]: [c_custkey#X, c_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(86) TakeOrderedAndProject +(99) TakeOrderedAndProject Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(87) AdaptiveSparkPlan +(100) AdaptiveSparkPlan Output [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/11.txt index 3b6c477c0cf5e..20bb486f38418 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/11.txt @@ -1,71 +1,75 @@ == Physical Plan == -AdaptiveSparkPlan (72) +AdaptiveSparkPlan (82) +- == Final Plan == - VeloxColumnarToRowExec (50) - +- ^ SortExecTransformer (48) - +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ ShuffleQueryStage (45), Statistics(X) - +- ColumnarExchange (44) - +- ^ FilterExecTransformer (42) - +- ^ RegularHashAggregateExecTransformer (41) - +- ^ InputIteratorTransformer (40) - +- ^ InputAdapter (39) - +- ^ ShuffleQueryStage (38), Statistics(X) - +- ColumnarExchange (37) - +- ^ ProjectExecTransformer (35) - +- ^ FlushableHashAggregateExecTransformer (34) - +- ^ ProjectExecTransformer (33) - +- ^ ShuffledHashJoinExecTransformer Inner (32) - :- ^ InputIteratorTransformer (23) - : +- ^ InputAdapter (22) - : +- ^ ShuffleQueryStage (21), Statistics(X) - : +- ColumnarExchange (20) - : +- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (56) + +- ^ SortExecTransformer (54) + +- ^ InputIteratorTransformer (53) + +- ShuffleQueryStage (51), Statistics(X) + +- ColumnarExchange (50) + +- VeloxAppendBatches (49) + +- ^ FilterExecTransformer (47) + +- ^ RegularHashAggregateExecTransformer (46) + +- ^ InputIteratorTransformer (45) + +- ShuffleQueryStage (43), Statistics(X) + +- ColumnarExchange (42) + +- VeloxAppendBatches (41) + +- ^ ProjectExecTransformer (39) + +- ^ FlushableHashAggregateExecTransformer (38) + +- ^ ProjectExecTransformer (37) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + :- ^ InputIteratorTransformer (26) + : +- ShuffleQueryStage (24), Statistics(X) + : +- ColumnarExchange (23) + : +- VeloxAppendBatches (22) + : +- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner 
BuildRight (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14), Statistics(X) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ FilterExecTransformer (10) - : +- ^ Scan parquet (9) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ ProjectExecTransformer (26) - +- ^ FilterExecTransformer (25) - +- ^ Scan parquet (24) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FilterExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ ProjectExecTransformer (29) + +- ^ FilterExecTransformer (28) + +- ^ Scan parquet (27) +- == Initial Plan == - Sort (71) - +- Exchange (70) - +- Filter (69) - +- HashAggregate (68) - +- Exchange (67) - +- HashAggregate (66) - +- Project (65) - +- ShuffledHashJoin Inner BuildRight (64) - :- Exchange (59) - : +- Project (58) - : +- ShuffledHashJoin Inner BuildRight (57) - : :- Exchange (53) - : : +- Filter (52) - : : +- Scan parquet (51) - : +- Exchange (56) - : +- Filter (55) - : +- Scan parquet (54) - +- Exchange (63) - +- Project (62) - +- Filter (61) - +- Scan parquet (60) + Sort (81) + +- Exchange (80) + +- Filter (79) + +- HashAggregate (78) + +- Exchange (77) + +- HashAggregate (76) + +- Project (75) + +- SortMergeJoin Inner (74) + :- Sort (68) + : +- Exchange (67) + : +- Project (66) + : +- SortMergeJoin Inner (65) + : :- Sort (60) + : : +- Exchange (59) + : : +- Filter (58) + : : +- Scan parquet (57) + : +- Sort (64) + : +- Exchange (63) + : +- Filter (62) + : +- Scan parquet (61) + +- Sort (73) + +- Exchange (72) + +- Project (71) + +- Filter (70) + +- Scan parquet (69) (1) Scan parquet @@ -87,553 +91,619 @@ Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(9) Scan parquet +(10) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: 
(isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: X + +(23) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X 
-(30) InputAdapter +(34) InputAdapter Input [1]: [n_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [1]: [n_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)) AS _pre_X#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(34) FlushableHashAggregateExecTransformer +(38) FlushableHashAggregateExecTransformer Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(35) ProjectExecTransformer +(39) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(36) WholeStageCodegenTransformer (X) +(40) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: false -(37) ColumnarExchange +(41) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] +Arguments: X + +(42) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(38) ShuffleQueryStage +(43) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: X -(39) InputAdapter +(44) InputAdapter Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(40) InputIteratorTransformer +(45) InputIteratorTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(41) RegularHashAggregateExecTransformer +(46) RegularHashAggregateExecTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X AS value#X] -(42) FilterExecTransformer +(47) FilterExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(43) WholeStageCodegenTransformer (X) +(48) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(44) ColumnarExchange +(49) VeloxAppendBatches +Input [2]: [ps_partkey#X, value#X] +Arguments: X + +(50) ColumnarExchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(45) ShuffleQueryStage +(51) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] Arguments: X -(46) InputAdapter +(52) InputAdapter Input [2]: [ps_partkey#X, value#X] -(47) InputIteratorTransformer +(53) InputIteratorTransformer Input [2]: [ps_partkey#X, value#X] -(48) SortExecTransformer +(54) SortExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: 
[value#X DESC NULLS LAST], true, 0 -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(50) VeloxColumnarToRowExec +(56) VeloxColumnarToRowExec Input [2]: [ps_partkey#X, value#X] -(51) Scan parquet +(57) Scan parquet Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(52) Filter +(58) Filter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(53) Exchange +(59) Exchange Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) Scan parquet +(60) Sort +Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(61) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(55) Filter +(62) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(56) Exchange +(63) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(57) ShuffledHashJoin +(64) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(65) SortMergeJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(58) Project +(66) Project Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(59) Exchange +(67) Exchange Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(60) Scan parquet +(68) Sort +Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(69) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(61) Filter +(70) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(62) Project +(71) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(63) Exchange +(72) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) ShuffledHashJoin +(73) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(74) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(65) Project +(75) Project Output [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(66) HashAggregate +(76) HashAggregate Input [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(67) 
Exchange +(77) Exchange Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) HashAggregate +(78) HashAggregate Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [2]: [ps_partkey#X, sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X AS value#X] -(69) Filter +(79) Filter Input [2]: [ps_partkey#X, value#X] Condition : (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(70) Exchange +(80) Exchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Sort +(81) Sort Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(72) AdaptiveSparkPlan +(82) AdaptiveSparkPlan Output [2]: [ps_partkey#X, value#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 42 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (120) +Subquery:1 Hosting operator id = 47 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (136) +- == Final Plan == - VeloxColumnarToRowExec (102) - +- ^ ProjectExecTransformer (100) - +- ^ RegularHashAggregateExecTransformer (99) - +- ^ RegularHashAggregateExecTransformer (98) - +- ^ ProjectExecTransformer (97) - +- ^ ShuffledHashJoinExecTransformer Inner (96) - :- ^ InputIteratorTransformer (91) - : +- ^ InputAdapter (90) - : +- ^ ShuffleQueryStage (89), Statistics(X) - : +- ColumnarExchange (88) - : +- ^ ProjectExecTransformer (86) - : +- ^ ShuffledHashJoinExecTransformer Inner (85) - : :- ^ InputIteratorTransformer (80) - : : +- ^ InputAdapter (79) - : : +- ^ ShuffleQueryStage (78), Statistics(X) - : : +- ColumnarExchange (77) - : : +- ^ ProjectExecTransformer (75) - : : +- ^ FilterExecTransformer (74) - : : +- ^ Scan parquet (73) - : +- ^ InputIteratorTransformer (84) - : +- ^ InputAdapter (83) - : +- ^ ShuffleQueryStage (82), Statistics(X) - : +- ReusedExchange (81) - +- ^ InputIteratorTransformer (95) - +- ^ InputAdapter (94) - +- ^ ShuffleQueryStage (93), Statistics(X) - +- ReusedExchange (92) + VeloxColumnarToRowExec (114) + +- ^ ProjectExecTransformer (112) + +- ^ RegularHashAggregateExecTransformer (111) + +- ^ RegularHashAggregateExecTransformer (110) + +- ^ ProjectExecTransformer (109) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (108) + :- ^ InputIteratorTransformer (103) + : +- ShuffleQueryStage (101), Statistics(X) + : +- ColumnarExchange (100) + : +- VeloxAppendBatches (99) + : +- ^ ProjectExecTransformer (97) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (96) + : :- ^ InputIteratorTransformer (91) + : : +- ShuffleQueryStage (89), Statistics(X) + : : +- ColumnarExchange (88) + : : +- VeloxAppendBatches (87) + : : +- ^ ProjectExecTransformer (85) + : : +- ^ FilterExecTransformer (84) + : : +- ^ Scan parquet (83) + : +- ^ InputIteratorTransformer (95) + : +- ShuffleQueryStage (93), Statistics(X) + : +- ReusedExchange (92) + +- ^ InputIteratorTransformer (107) + +- ShuffleQueryStage (105), Statistics(X) + +- ReusedExchange (104) +- == Initial Plan == 
- HashAggregate (119) - +- HashAggregate (118) - +- Project (117) - +- ShuffledHashJoin Inner BuildRight (116) - :- Exchange (111) - : +- Project (110) - : +- ShuffledHashJoin Inner BuildRight (109) - : :- Exchange (105) - : : +- Filter (104) - : : +- Scan parquet (103) - : +- Exchange (108) - : +- Filter (107) - : +- Scan parquet (106) - +- Exchange (115) - +- Project (114) - +- Filter (113) - +- Scan parquet (112) - - -(73) Scan parquet + HashAggregate (135) + +- HashAggregate (134) + +- Project (133) + +- SortMergeJoin Inner (132) + :- Sort (126) + : +- Exchange (125) + : +- Project (124) + : +- SortMergeJoin Inner (123) + : :- Sort (118) + : : +- Exchange (117) + : : +- Filter (116) + : : +- Scan parquet (115) + : +- Sort (122) + : +- Exchange (121) + : +- Filter (120) + : +- Scan parquet (119) + +- Sort (131) + +- Exchange (130) + +- Project (129) + +- Filter (128) + +- Scan parquet (127) + + +(83) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(74) FilterExecTransformer +(84) FilterExecTransformer Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: isnotnull(ps_suppkey#X) -(75) ProjectExecTransformer +(85) ProjectExecTransformer Output [4]: [hash(ps_suppkey#X, 42) AS hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(76) WholeStageCodegenTransformer (X) +(86) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: false -(77) ColumnarExchange +(87) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: X + +(88) ColumnarExchange Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] -(78) ShuffleQueryStage +(89) ShuffleQueryStage Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: X -(79) InputAdapter +(90) InputAdapter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(80) InputIteratorTransformer +(91) InputIteratorTransformer Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(81) ReusedExchange [Reuses operator id: 13] +(92) ReusedExchange [Reuses operator id: 15] Output [2]: [s_suppkey#X, s_nationkey#X] -(82) ShuffleQueryStage +(93) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(83) InputAdapter +(94) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(84) InputIteratorTransformer +(95) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(85) ShuffledHashJoinExecTransformer +(96) ShuffledHashJoinExecTransformer Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(86) ProjectExecTransformer +(97) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: X + +(100) ColumnarExchange Input [4]: 
[hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(92) ReusedExchange [Reuses operator id: 28] +(104) ReusedExchange [Reuses operator id: 32] Output [1]: [n_nationkey#X] -(93) ShuffleQueryStage +(105) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(94) InputAdapter +(106) InputAdapter Input [1]: [n_nationkey#X] -(95) InputIteratorTransformer +(107) InputIteratorTransformer Input [1]: [n_nationkey#X] -(96) ShuffledHashJoinExecTransformer +(108) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(97) ProjectExecTransformer +(109) ProjectExecTransformer Output [3]: [ps_availqty#X, ps_supplycost#X, CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)) AS _pre_X#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(98) RegularHashAggregateExecTransformer +(110) RegularHashAggregateExecTransformer Input [3]: [ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(99) RegularHashAggregateExecTransformer +(111) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] -(100) ProjectExecTransformer +(112) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(cast(sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X as decimal(38,10))) * 0.0001000000), DecimalType(38,6)) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Input [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] -(101) WholeStageCodegenTransformer (X) +(113) WholeStageCodegenTransformer (X) Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: false -(102) VeloxColumnarToRowExec +(114) VeloxColumnarToRowExec Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(103) Scan parquet +(115) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(104) Filter +(116) Filter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(105) Exchange +(117) Exchange Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) 
Scan parquet +(118) Sort +Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(119) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(107) Filter +(120) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(108) Exchange +(121) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(122) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(123) SortMergeJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(110) Project +(124) Project Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(111) Exchange +(125) Exchange Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) Scan parquet +(126) Sort +Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(127) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(113) Filter +(128) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(114) Project +(129) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(115) Exchange +(130) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(116) ShuffledHashJoin +(131) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(117) Project +(133) Project Output [2]: [ps_availqty#X, ps_supplycost#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(118) HashAggregate +(134) HashAggregate Input [2]: [ps_availqty#X, ps_supplycost#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(119) HashAggregate +(135) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X] Results [1]: [CheckOverflow((promote_precision(cast(sum(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(cast(ps_availqty#X as decimal(12,2)))), DecimalType(23,2)))#X as decimal(38,10))) * 0.0001000000), DecimalType(38,6)) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(120) AdaptiveSparkPlan +(136) AdaptiveSparkPlan Output [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git 
a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/12.txt index 259ddb368b8de..1b36d274aab44 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/12.txt @@ -1,50 +1,52 @@ == Physical Plan == -AdaptiveSparkPlan (49) +AdaptiveSparkPlan (55) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- ^ SortExecTransformer (32) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (48) - +- Exchange (47) - +- HashAggregate (46) - +- Exchange (45) - +- HashAggregate (44) - +- Project (43) - +- ShuffledHashJoin Inner BuildLeft (42) - :- Exchange (37) - : +- Filter (36) - : +- Scan parquet (35) - +- Exchange (41) - +- Project (40) - +- Filter (39) - +- Scan parquet (38) + Sort (54) + +- Exchange (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- Project (49) + +- SortMergeJoin Inner (48) + :- Sort (42) + : +- Exchange (41) + : +- Filter (40) + : +- Scan parquet (39) + +- Sort (47) + +- Exchange (46) + +- Project (45) + +- Filter (44) + +- Scan parquet (43) (1) Scan parquet @@ -66,196 +68,220 @@ Input [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage 
+(7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [o_orderkey#X, o_orderpriority#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderpriority#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Arguments: ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_shipmode#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [l_orderkey#X, l_shipmode#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [l_orderkey#X, l_shipmode#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [o_orderpriority#X, l_shipmode#X, CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END AS _pre_X#X, CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END AS _pre_X#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [o_orderpriority#X, l_shipmode#X, _pre_X#X, _pre_X#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(_pre_X#X), partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [4]: [hash(l_shipmode#X, 42) AS hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Input [3]: [l_shipmode#X, sum#X, sum#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] +Arguments: X + +(25) ColumnarExchange Input [4]: [hash_partition_key#X, 
l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [3]: [l_shipmode#X, sum#X, sum#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(32) SortExecTransformer +(36) SortExecTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(33) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(34) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(35) Scan parquet +(39) Scan parquet Output [2]: [o_orderkey#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(36) Filter +(40) Filter Input [2]: [o_orderkey#X, o_orderpriority#X] Condition : isnotnull(o_orderkey#X) -(37) Exchange +(41) Exchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(38) Scan parquet +(42) Sort +Input [2]: [o_orderkey#X, o_orderpriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(43) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] 
ReadSchema: struct -(39) Filter +(44) Filter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Condition : ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(40) Project +(45) Project Output [2]: [l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(41) Exchange +(46) Exchange Input [2]: [l_orderkey#X, l_shipmode#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(42) ShuffledHashJoin +(47) Sort +Input [2]: [l_orderkey#X, l_shipmode#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(48) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(43) Project +(49) Project Output [2]: [o_orderpriority#X, l_shipmode#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(44) HashAggregate +(50) HashAggregate Input [2]: [o_orderpriority#X, l_shipmode#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), partial_sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(45) Exchange +(51) Exchange Input [3]: [l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(46) HashAggregate +(52) HashAggregate Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(47) Exchange +(53) Exchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(48) Sort +(54) Sort Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(49) AdaptiveSparkPlan +(55) AdaptiveSparkPlan Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/13.txt index 3f1309f476d94..83ec9aeda98a2 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/13.txt @@ -1,53 +1,55 @@ == Physical Plan == -AdaptiveSparkPlan (52) +AdaptiveSparkPlan (58) +- == Final Plan == - VeloxColumnarToRowExec (36) - +- ^ 
SortExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31), Statistics(X) - +- ColumnarExchange (30) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ ProjectExecTransformer (17) - +- ^ ShuffledHashJoinExecTransformer LeftOuter (16) - :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ ShuffleQueryStage (5), Statistics(X) - : +- ColumnarExchange (4) + VeloxColumnarToRowExec (40) + +- ^ SortExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35), Statistics(X) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ FlushableHashAggregateExecTransformer (23) + +- ^ ProjectExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ ProjectExecTransformer (19) + +- ^ ShuffledHashJoinExecTransformer LeftOuter BuildLeft (18) + :- ^ InputIteratorTransformer (8) + : +- ShuffleQueryStage (6), Statistics(X) + : +- ColumnarExchange (5) + : +- VeloxAppendBatches (4) : +- ^ ProjectExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (15) - +- ^ InputAdapter (14) - +- ^ ShuffleQueryStage (13), Statistics(X) - +- ColumnarExchange (12) - +- ^ ProjectExecTransformer (10) - +- ^ FilterExecTransformer (9) - +- ^ Scan parquet (8) + +- ^ InputIteratorTransformer (17) + +- ShuffleQueryStage (15), Statistics(X) + +- ColumnarExchange (14) + +- VeloxAppendBatches (13) + +- ^ ProjectExecTransformer (11) + +- ^ FilterExecTransformer (10) + +- ^ Scan parquet (9) +- == Initial Plan == - Sort (51) - +- Exchange (50) - +- HashAggregate (49) - +- Exchange (48) - +- HashAggregate (47) - +- HashAggregate (46) - +- HashAggregate (45) - +- Project (44) - +- ShuffledHashJoin LeftOuter BuildRight (43) - :- Exchange (38) - : +- Scan parquet (37) - +- Exchange (42) - +- Project (41) - +- Filter (40) - +- Scan parquet (39) + Sort (57) + +- Exchange (56) + +- HashAggregate (55) + +- Exchange (54) + +- HashAggregate (53) + +- HashAggregate (52) + +- HashAggregate (51) + +- Project (50) + +- SortMergeJoin LeftOuter (49) + :- Sort (43) + : +- Exchange (42) + : +- Scan parquet (41) + +- Sort (48) + +- Exchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -64,223 +66,247 @@ Input [1]: [c_custkey#X] Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: false -(4) ColumnarExchange +(4) VeloxAppendBatches +Input [2]: [hash_partition_key#X, c_custkey#X] +Arguments: X + +(5) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] -(5) ShuffleQueryStage +(6) ShuffleQueryStage Output [1]: [c_custkey#X] Arguments: X -(6) InputAdapter +(7) InputAdapter Input [1]: [c_custkey#X] -(7) InputIteratorTransformer +(8) InputIteratorTransformer Input [1]: [c_custkey#X] -(8) Scan parquet +(9) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, 
o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(9) FilterExecTransformer +(10) FilterExecTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Arguments: ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(10) ProjectExecTransformer +(11) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(12) ColumnarExchange +(13) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(14) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(13) ShuffleQueryStage +(15) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(14) InputAdapter +(16) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(15) InputIteratorTransformer +(17) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(16) ShuffledHashJoinExecTransformer +(18) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(17) ProjectExecTransformer +(19) ProjectExecTransformer Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(18) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [2]: [c_custkey#X, count(o_orderkey#X)#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [1]: [count(o_orderkey#X)#X AS c_count#X] Input [2]: [c_custkey#X, count(o_orderkey#X)#X] -(21) FlushableHashAggregateExecTransformer +(23) FlushableHashAggregateExecTransformer Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [3]: [hash(c_count#X, 42) AS hash_partition_key#X, c_count#X, count#X] Input [2]: [c_count#X, count#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_count#X, count#X] +Arguments: X + +(27) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [2]: [c_count#X, count#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [2]: [c_count#X, count#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input [2]: [c_count#X, count#X] -(28) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [2]: [c_count#X, count#X] Keys 
[1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(30) ColumnarExchange +(33) VeloxAppendBatches +Input [2]: [c_count#X, custdist#X] +Arguments: X + +(34) ColumnarExchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(35) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] Arguments: X -(32) InputAdapter +(36) InputAdapter Input [2]: [c_count#X, custdist#X] -(33) InputIteratorTransformer +(37) InputIteratorTransformer Input [2]: [c_count#X, custdist#X] -(34) SortExecTransformer +(38) SortExecTransformer Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(36) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [2]: [c_count#X, custdist#X] -(37) Scan parquet +(41) Scan parquet Output [1]: [c_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Exchange +(42) Exchange Input [1]: [c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) Scan parquet +(43) Sort +Input [1]: [c_custkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(44) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(40) Filter +(45) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Condition : ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(41) Project +(46) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(42) Exchange +(47) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) ShuffledHashJoin +(48) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(49) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(44) Project +(50) Project Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(45) HashAggregate +(51) HashAggregate Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(46) HashAggregate +(52) HashAggregate Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [1]: [count(o_orderkey#X)#X AS c_count#X] -(47) HashAggregate +(53) HashAggregate Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(48) Exchange +(54) Exchange Input [2]: [c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) HashAggregate +(55) HashAggregate Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X 
AS custdist#X] -(50) Exchange +(56) Exchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Sort +(57) Sort Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(52) AdaptiveSparkPlan +(58) AdaptiveSparkPlan Output [2]: [c_count#X, custdist#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/14.txt index 68716942cea85..a1108606b5bb9 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/14.txt @@ -1,38 +1,40 @@ == Physical Plan == -AdaptiveSparkPlan (35) +AdaptiveSparkPlan (39) +- == Final Plan == - VeloxColumnarToRowExec (23) - +- ^ ProjectExecTransformer (21) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (25) + +- ^ ProjectExecTransformer (23) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - HashAggregate (34) - +- HashAggregate (33) - +- Project (32) - +- ShuffledHashJoin Inner BuildRight (31) - :- Exchange (27) - : +- Project (26) - : +- Filter (25) - : +- Scan parquet (24) - +- Exchange (30) - +- Filter (29) - +- Scan parquet (28) + HashAggregate (38) + +- HashAggregate (37) + +- Project (36) + +- SortMergeJoin Inner (35) + :- Sort (30) + : +- Exchange (29) + : +- Project (28) + : +- Filter (27) + : +- Scan parquet (26) + +- Sort (34) + +- Exchange (33) + +- Filter (32) + +- Scan parquet (31) (1) Scan parquet @@ -54,144 +56,160 @@ Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: 
[l_partkey#X, l_extendedprice#X, l_discount#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] -(9) Scan parquet +(10) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [2]: [p_partkey#X, p_type#X] Arguments: isnotnull(p_partkey#X) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_type#X] Input [2]: [p_partkey#X, p_type#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [p_partkey#X, p_type#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [p_partkey#X, p_type#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [p_partkey#X, p_type#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [l_extendedprice#X, l_discount#X, p_type#X, CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END AS _pre_X#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [5]: [l_extendedprice#X, l_discount#X, p_type#X, _pre_X#X, _pre_X#X] Keys: [] Functions [2]: [partial_sum(_pre_X#X), partial_sum(_pre_X#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(20) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, 
sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(21) ProjectExecTransformer +(23) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X)), DecimalType(38,6))) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X as decimal(38,6)))), DecimalType(38,6)) AS promo_revenue#X] Input [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(22) WholeStageCodegenTransformer (X) +(24) WholeStageCodegenTransformer (X) Input [1]: [promo_revenue#X] Arguments: false -(23) VeloxColumnarToRowExec +(25) VeloxColumnarToRowExec Input [1]: [promo_revenue#X] -(24) Scan parquet +(26) Scan parquet Output [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-09-01), LessThan(l_shipdate,1995-10-01), IsNotNull(l_partkey)] ReadSchema: struct -(25) Filter +(27) Filter Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-09-01)) AND (l_shipdate#X < 1995-10-01)) AND isnotnull(l_partkey#X)) -(26) Project +(28) Project Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(27) Exchange +(29) Exchange Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(28) Scan parquet +(30) Sort +Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(31) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(29) Filter +(32) Filter Input [2]: [p_partkey#X, p_type#X] Condition : isnotnull(p_partkey#X) -(30) Exchange +(33) Exchange Input [2]: [p_partkey#X, 
p_type#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(31) ShuffledHashJoin +(34) Sort +Input [2]: [p_partkey#X, p_type#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(35) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(32) Project +(36) Project Output [3]: [l_extendedprice#X, l_discount#X, p_type#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(33) HashAggregate +(37) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, p_type#X] Keys: [] Functions [2]: [partial_sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END), partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(34) HashAggregate +(38) HashAggregate Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END), sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [CheckOverflow((promote_precision(CheckOverflow((100.0000 * promote_precision(sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END)#X)), DecimalType(38,6))) / promote_precision(cast(sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X as decimal(38,6)))), DecimalType(38,6)) AS promo_revenue#X] -(35) AdaptiveSparkPlan +(39) AdaptiveSparkPlan Output [1]: [promo_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/15.txt index a9c9e810dd929..88730deb3c32b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/15.txt +++ 
b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/15.txt @@ -1,45 +1,48 @@ == Physical Plan == -AdaptiveSparkPlan (42) +AdaptiveSparkPlan (47) +- == Final Plan == - VeloxColumnarToRowExec (27) - +- AQEShuffleRead (26) - +- ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ ShuffledHashJoinExecTransformer Inner (21) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) - : +- ^ ProjectExecTransformer (3) - : +- ^ FilterExecTransformer (2) - : +- ^ Scan parquet (1) - +- ^ FilterExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) - +- ^ ProjectExecTransformer (13) - +- ^ FlushableHashAggregateExecTransformer (12) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + VeloxColumnarToRowExec (30) + +- AQEShuffleRead (29) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (23) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) + : +- ^ ProjectExecTransformer (3) + : +- ^ FilterExecTransformer (2) + : +- ^ Scan parquet (1) + +- ^ FilterExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ InputIteratorTransformer (20) + +- ShuffleQueryStage (18), Statistics(X) + +- ColumnarExchange (17) + +- VeloxAppendBatches (16) + +- ^ ProjectExecTransformer (14) + +- ^ FlushableHashAggregateExecTransformer (13) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (41) - +- Exchange (40) - +- Project (39) - +- ShuffledHashJoin Inner BuildLeft (38) - :- Exchange (30) - : +- Filter (29) - : +- Scan parquet (28) - +- Filter (37) - +- HashAggregate (36) - +- Exchange (35) - +- HashAggregate (34) - +- Project (33) - +- Filter (32) - +- Scan parquet (31) + Sort (46) + +- Exchange (45) + +- Project (44) + +- SortMergeJoin Inner (43) + :- Sort (34) + : +- Exchange (33) + : +- Filter (32) + : +- Scan parquet (31) + +- Sort (42) + +- Filter (41) + +- HashAggregate (40) + +- Exchange (39) + +- HashAggregate (38) + +- Project (37) + +- Filter (36) + +- Scan parquet (35) (1) Scan parquet @@ -61,329 +64,353 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] 
Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(12) FlushableHashAggregateExecTransformer +(13) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(13) ProjectExecTransformer +(14) ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(14) WholeStageCodegenTransformer (X) +(15) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(16) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(17) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(18) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(19) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(18) InputIteratorTransformer +(20) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] -(20) FilterExecTransformer +(22) FilterExecTransformer Input [2]: [supplier_no#X, total_revenue#X] Arguments: (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(21) ShuffledHashJoinExecTransformer +(23) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [5]: [s_suppkey#X, s_name#X, s_address#X, 
s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] +Arguments: X + +(27) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: X -(26) AQEShuffleRead +(29) AQEShuffleRead Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: local -(27) VeloxColumnarToRowExec +(30) VeloxColumnarToRowExec Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(28) Scan parquet +(31) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey)] ReadSchema: struct -(29) Filter +(32) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Condition : isnotnull(s_suppkey#X) -(30) Exchange +(33) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(31) Scan parquet +(34) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(35) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(32) Filter +(36) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(33) Project +(37) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(34) HashAggregate +(38) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(35) Exchange +(39) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(36) HashAggregate +(40) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), 
DecimalType(26,4)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] -(37) Filter +(41) Filter Input [2]: [supplier_no#X, total_revenue#X] Condition : (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(38) ShuffledHashJoin +(42) Sort +Input [2]: [supplier_no#X, total_revenue#X] +Arguments: [supplier_no#X ASC NULLS FIRST], false, 0 + +(43) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join condition: None -(39) Project +(44) Project Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(40) Exchange +(45) Exchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Sort +(46) Sort Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(42) AdaptiveSparkPlan +(47) AdaptiveSparkPlan Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 20 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (67) +Subquery:1 Hosting operator id = 22 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (73) +- == Final Plan == - VeloxColumnarToRowExec (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ RegularHashAggregateExecTransformer (55) - +- ^ ProjectExecTransformer (54) - +- ^ RegularHashAggregateExecTransformer (53) - +- ^ InputIteratorTransformer (52) - +- ^ InputAdapter (51) - +- ^ ShuffleQueryStage (50), Statistics(X) - +- ColumnarExchange (49) - +- ^ ProjectExecTransformer (47) - +- ^ FlushableHashAggregateExecTransformer (46) - +- ^ ProjectExecTransformer (45) - +- ^ FilterExecTransformer (44) - +- ^ Scan parquet (43) + VeloxColumnarToRowExec (64) + +- ^ RegularHashAggregateExecTransformer (62) + +- ^ RegularHashAggregateExecTransformer (61) + +- ^ ProjectExecTransformer (60) + +- ^ RegularHashAggregateExecTransformer (59) + +- ^ InputIteratorTransformer (58) + +- ShuffleQueryStage (56), Statistics(X) + +- ColumnarExchange (55) + +- VeloxAppendBatches (54) + +- ^ ProjectExecTransformer (52) + +- ^ FlushableHashAggregateExecTransformer (51) + +- ^ ProjectExecTransformer (50) + +- ^ FilterExecTransformer (49) + +- ^ Scan parquet (48) +- == Initial Plan == - HashAggregate (66) - +- HashAggregate (65) - +- HashAggregate (64) - +- Exchange (63) - +- HashAggregate (62) - +- Project (61) - +- Filter (60) - +- Scan parquet (59) + HashAggregate (72) + +- HashAggregate (71) + +- HashAggregate (70) + +- Exchange (69) + +- HashAggregate (68) + +- Project (67) + +- Filter (66) + +- Scan parquet (65) -(43) Scan parquet +(48) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(44) FilterExecTransformer +(49) FilterExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: ((isnotnull(l_shipdate#X) AND (l_shipdate#X 
>= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) -(45) ProjectExecTransformer +(50) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(46) FlushableHashAggregateExecTransformer +(51) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(47) ProjectExecTransformer +(52) ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(48) WholeStageCodegenTransformer (X) +(53) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(49) ColumnarExchange +(54) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(55) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(50) ShuffleQueryStage +(56) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(51) InputAdapter +(57) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(52) InputIteratorTransformer +(58) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(53) RegularHashAggregateExecTransformer +(59) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [l_suppkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(54) ProjectExecTransformer +(60) ProjectExecTransformer Output [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] Input [2]: [l_suppkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(55) RegularHashAggregateExecTransformer +(61) RegularHashAggregateExecTransformer Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(56) RegularHashAggregateExecTransformer +(62) RegularHashAggregateExecTransformer Input [1]: [max#X] Keys: [] Functions [1]: 
[max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(57) WholeStageCodegenTransformer (X) +(63) WholeStageCodegenTransformer (X) Input [1]: [max(total_revenue)#X] Arguments: false -(58) VeloxColumnarToRowExec +(64) VeloxColumnarToRowExec Input [1]: [max(total_revenue)#X] -(59) Scan parquet +(65) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(60) Filter +(66) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) -(61) Project +(67) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(62) HashAggregate +(68) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(63) Exchange +(69) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) HashAggregate +(70) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS total_revenue#X] -(65) HashAggregate +(71) HashAggregate Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(66) HashAggregate +(72) HashAggregate Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(67) AdaptiveSparkPlan +(73) AdaptiveSparkPlan Output [1]: [max(total_revenue)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/16.txt index f4b4bc81f48f9..535b6940301d8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/16.txt @@ -1,64 +1,66 @@ == Physical Plan == -AdaptiveSparkPlan (64) +AdaptiveSparkPlan (71) +- == Final Plan == - VeloxColumnarToRowExec (42) - +- ^ SortExecTransformer (40) - +- ^ 
InputIteratorTransformer (39) - +- ^ InputAdapter (38) - +- ^ ShuffleQueryStage (37), Statistics(X) - +- ColumnarExchange (36) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31), Statistics(X) - +- ColumnarExchange (30) - +- ^ ProjectExecTransformer (28) - +- ^ FlushableHashAggregateExecTransformer (27) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (47) + +- ^ SortExecTransformer (45) + +- ^ InputIteratorTransformer (44) + +- ShuffleQueryStage (42), Statistics(X) + +- ColumnarExchange (41) + +- VeloxAppendBatches (40) + +- ^ RegularHashAggregateExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35), Statistics(X) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ ProjectExecTransformer (31) + +- ^ FlushableHashAggregateExecTransformer (30) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (63) - +- Exchange (62) - +- HashAggregate (61) - +- Exchange (60) - +- HashAggregate (59) - +- HashAggregate (58) - +- Exchange (57) - +- HashAggregate (56) - +- Project (55) - +- ShuffledHashJoin Inner BuildRight (54) - :- Exchange (50) - : +- BroadcastHashJoin LeftAnti BuildRight (49) - : :- Filter (44) - : : +- Scan parquet (43) - : +- BroadcastExchange (48) - : +- Project (47) - : +- Filter (46) - : +- Scan parquet (45) - +- Exchange (53) - +- Filter (52) - +- Scan parquet (51) + Sort (70) + +- Exchange (69) + +- HashAggregate (68) + +- Exchange (67) + +- HashAggregate (66) + +- HashAggregate (65) + +- Exchange (64) + +- HashAggregate (63) + +- Project (62) + +- SortMergeJoin Inner (61) + :- Sort (56) + : +- Exchange (55) + : +- BroadcastHashJoin LeftAnti BuildRight (54) + : :- Filter (49) + : : +- Scan parquet (48) + : +- BroadcastExchange (53) + : +- Project (52) + : +- Filter (51) + : +- Scan parquet (50) + +- Sort (60) + +- Exchange (59) + +- Filter (58) + +- Scan parquet (57) (1) Scan parquet @@ -80,270 +82,298 @@ Input [2]: [ps_partkey#X, ps_suppkey#X] Input [3]: [hash_partition_key#X, 
ps_partkey#X, ps_suppkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [ps_partkey#X, ps_suppkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [ps_partkey#X, ps_suppkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [ps_partkey#X, ps_suppkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input 
[5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(27) FlushableHashAggregateExecTransformer +(30) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(28) ProjectExecTransformer +(31) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: false -(30) ColumnarExchange +(33) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] +Arguments: X + +(34) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(35) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: X -(32) InputAdapter +(36) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(33) InputIteratorTransformer +(37) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(34) RegularHashAggregateExecTransformer +(38) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] +Arguments: X + +(41) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] 
-(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(40) SortExecTransformer +(45) SortExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(41) WholeStageCodegenTransformer (X) +(46) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(42) VeloxColumnarToRowExec +(47) VeloxColumnarToRowExec Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(43) Scan parquet +(48) Scan parquet Output [2]: [ps_partkey#X, ps_suppkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_partkey)] ReadSchema: struct -(44) Filter +(49) Filter Input [2]: [ps_partkey#X, ps_suppkey#X] Condition : isnotnull(ps_partkey#X) -(45) Scan parquet +(50) Scan parquet Output [2]: [s_suppkey#X, s_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_comment)] ReadSchema: struct -(46) Filter +(51) Filter Input [2]: [s_suppkey#X, s_comment#X] Condition : (isnotnull(s_comment#X) AND s_comment#X LIKE %Customer%Complaints%) -(47) Project +(52) Project Output [1]: [s_suppkey#X] Input [2]: [s_suppkey#X, s_comment#X] -(48) BroadcastExchange +(53) BroadcastExchange Input [1]: [s_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),true), [plan_id=X] -(49) BroadcastHashJoin +(54) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(50) Exchange +(55) Exchange Input [2]: [ps_partkey#X, ps_suppkey#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Scan parquet +(56) Sort +Input [2]: [ps_partkey#X, ps_suppkey#X] +Arguments: [ps_partkey#X ASC NULLS FIRST], false, 0 + +(57) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(52) Filter +(58) Filter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Condition : (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(53) Exchange +(59) Exchange Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) ShuffledHashJoin +(60) Sort +Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(61) SortMergeJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(55) Project +(62) Project Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(56) HashAggregate +(63) HashAggregate Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(57) Exchange +(64) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] 
-(58) HashAggregate +(65) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(59) HashAggregate +(66) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(60) Exchange +(67) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(61) HashAggregate +(68) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(62) Exchange +(69) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(63) Sort +(70) Sort Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(64) AdaptiveSparkPlan +(71) AdaptiveSparkPlan Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/17.txt index 50c9d211b3d36..d360b6c948e34 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/17.txt @@ -1,59 +1,62 @@ == Physical Plan == -AdaptiveSparkPlan (57) +AdaptiveSparkPlan (63) +- == Final Plan == - VeloxColumnarToRowExec (37) - +- ^ ProjectExecTransformer (35) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ RegularHashAggregateExecTransformer (33) - +- ^ ProjectExecTransformer (32) - +- ^ ShuffledHashJoinExecTransformer Inner (31) - :- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (40) + +- ^ ProjectExecTransformer (38) + +- ^ RegularHashAggregateExecTransformer (37) + +- ^ RegularHashAggregateExecTransformer (36) + +- ^ ProjectExecTransformer (35) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (34) + :- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14), Statistics(X) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ FilterExecTransformer (10) - : +- ^ Scan parquet (9) - +- ^ FilterExecTransformer (30) - +- ^ ProjectExecTransformer (29) - +- ^ 
RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ FilterExecTransformer (20) - +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FilterExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ FilterExecTransformer (33) + +- ^ ProjectExecTransformer (32) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ FlushableHashAggregateExecTransformer (23) + +- ^ FilterExecTransformer (22) + +- ^ Scan parquet (21) +- == Initial Plan == - HashAggregate (56) - +- HashAggregate (55) - +- Project (54) - +- ShuffledHashJoin Inner BuildRight (53) - :- Project (46) - : +- ShuffledHashJoin Inner BuildRight (45) - : :- Exchange (40) - : : +- Filter (39) - : : +- Scan parquet (38) - : +- Exchange (44) - : +- Project (43) - : +- Filter (42) - : +- Scan parquet (41) - +- Filter (52) - +- HashAggregate (51) - +- Exchange (50) - +- HashAggregate (49) - +- Filter (48) - +- Scan parquet (47) + HashAggregate (62) + +- HashAggregate (61) + +- Project (60) + +- SortMergeJoin Inner (59) + :- Project (51) + : +- SortMergeJoin Inner (50) + : :- Sort (44) + : : +- Exchange (43) + : : +- Filter (42) + : : +- Scan parquet (41) + : +- Sort (49) + : +- Exchange (48) + : +- Project (47) + : +- Filter (46) + : +- Scan parquet (45) + +- Sort (58) + +- Filter (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- Filter (53) + +- Scan parquet (52) (1) Scan parquet @@ -75,250 +78,274 @@ Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [p_partkey#X, p_brand#X, p_container#X] Arguments: ((((isnotnull(p_brand#X) AND isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [2]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(12) 
WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(15) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [1]: [p_partkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [1]: [p_partkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(19) Scan parquet +(21) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(20) FilterExecTransformer +(22) FilterExecTransformer Input [2]: [l_partkey#X, l_quantity#X] Arguments: isnotnull(l_partkey#X) -(21) FlushableHashAggregateExecTransformer +(23) FlushableHashAggregateExecTransformer Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [4]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_partkey#X, sum#X, count#X] Input [3]: [l_partkey#X, sum#X, count#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] +Arguments: X + +(27) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [3]: [l_partkey#X, sum#X, count#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [3]: [l_partkey#X, sum#X, count#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input [3]: [l_partkey#X, sum#X, count#X] -(28) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [l_partkey#X, avg(l_quantity#X)#X] -(29) ProjectExecTransformer +(32) ProjectExecTransformer Output [2]: [CheckOverflow((0.200000 * promote_precision(avg(l_quantity#X)#X)), DecimalType(18,7)) AS (0.2 * avg(l_quantity))#X, l_partkey#X] Input [2]: [l_partkey#X, avg(l_quantity#X)#X] -(30) FilterExecTransformer +(33) FilterExecTransformer Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Arguments: isnotnull((0.2 * avg(l_quantity))#X) -(31) ShuffledHashJoinExecTransformer +(34) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(32) ProjectExecTransformer +(35) ProjectExecTransformer Output [1]: [l_extendedprice#X] Input [5]: 
[l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(33) RegularHashAggregateExecTransformer +(36) RegularHashAggregateExecTransformer Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(34) RegularHashAggregateExecTransformer +(37) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [sum(l_extendedprice#X)#X] -(35) ProjectExecTransformer +(38) ProjectExecTransformer Output [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6)) AS avg_yearly#X] Input [1]: [sum(l_extendedprice#X)#X] -(36) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [1]: [avg_yearly#X] Arguments: false -(37) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [1]: [avg_yearly#X] -(38) Scan parquet +(41) Scan parquet Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_quantity)] ReadSchema: struct -(39) Filter +(42) Filter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Condition : (isnotnull(l_partkey#X) AND isnotnull(l_quantity#X)) -(40) Exchange +(43) Exchange Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Scan parquet +(44) Sort +Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(45) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(42) Filter +(46) Filter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Condition : ((((isnotnull(p_brand#X) AND isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(43) Project +(47) Project Output [1]: [p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(44) Exchange +(48) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(45) ShuffledHashJoin +(49) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(50) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(46) Project +(51) Project Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(47) Scan parquet +(52) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(48) Filter +(53) Filter Input [2]: [l_partkey#X, l_quantity#X] Condition : isnotnull(l_partkey#X) -(49) HashAggregate +(54) HashAggregate Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(50) Exchange +(55) Exchange Input [3]: [l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) HashAggregate +(56) HashAggregate Input [3]: [l_partkey#X, sum#X, count#X] Keys 
[1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [CheckOverflow((0.200000 * promote_precision(avg(l_quantity#X)#X)), DecimalType(18,7)) AS (0.2 * avg(l_quantity))#X, l_partkey#X] -(52) Filter +(57) Filter Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Condition : isnotnull((0.2 * avg(l_quantity))#X) -(53) ShuffledHashJoin +(58) Sort +Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(59) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(54) Project +(60) Project Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(55) HashAggregate +(61) HashAggregate Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(56) HashAggregate +(62) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [CheckOverflow((promote_precision(sum(l_extendedprice#X)#X) / 7.00), DecimalType(27,6)) AS avg_yearly#X] -(57) AdaptiveSparkPlan +(63) AdaptiveSparkPlan Output [1]: [avg_yearly#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/18.txt index a29b40d1e996e..a664adfd3175c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/18.txt @@ -1,96 +1,102 @@ == Physical Plan == -AdaptiveSparkPlan (97) +AdaptiveSparkPlan (110) +- == Final Plan == - VeloxColumnarToRowExec (64) - +- TakeOrderedAndProjectExecTransformer (63) - +- ^ RegularHashAggregateExecTransformer (61) - +- ^ RegularHashAggregateExecTransformer (60) - +- ^ ProjectExecTransformer (59) - +- ^ ShuffledHashJoinExecTransformer Inner (58) - :- ^ InputIteratorTransformer (41) - : +- ^ InputAdapter (40) - : +- ^ ShuffleQueryStage (39), Statistics(X) - : +- ColumnarExchange (38) - : +- ^ ProjectExecTransformer (36) - : +- ^ ShuffledHashJoinExecTransformer Inner (35) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (70) + +- TakeOrderedAndProjectExecTransformer (69) + +- ^ RegularHashAggregateExecTransformer (67) + +- ^ RegularHashAggregateExecTransformer (66) + +- ^ ProjectExecTransformer (65) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (64) + :- ^ InputIteratorTransformer (46) + : +- ShuffleQueryStage (44), Statistics(X) + : +- ColumnarExchange (43) + : +- VeloxAppendBatches (42) + : +- ^ ProjectExecTransformer (40) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (39) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (34) - : +- ^ InputAdapter (33) - : +- ^ ShuffleQueryStage (32), Statistics(X) - : +- ColumnarExchange (31) - : +- ^ ProjectExecTransformer (29) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (28) - : :- ^ 
InputIteratorTransformer (16) - : : +- ^ InputAdapter (15) - : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : +- ColumnarExchange (13) - : : +- ^ ProjectExecTransformer (11) - : : +- ^ FilterExecTransformer (10) - : : +- ^ Scan parquet (9) - : +- ^ ProjectExecTransformer (27) - : +- ^ FilterExecTransformer (26) - : +- ^ RegularHashAggregateExecTransformer (25) - : +- ^ InputIteratorTransformer (24) - : +- ^ InputAdapter (23) - : +- ^ ShuffleQueryStage (22), Statistics(X) - : +- ColumnarExchange (21) - : +- ^ ProjectExecTransformer (19) - : +- ^ FlushableHashAggregateExecTransformer (18) - : +- ^ Scan parquet (17) - +- ^ ShuffledHashJoinExecTransformer LeftSemi (57) - :- ^ InputIteratorTransformer (49) - : +- ^ InputAdapter (48) - : +- ^ ShuffleQueryStage (47), Statistics(X) - : +- ColumnarExchange (46) - : +- ^ ProjectExecTransformer (44) - : +- ^ FilterExecTransformer (43) - : +- ^ Scan parquet (42) - +- ^ ProjectExecTransformer (56) - +- ^ FilterExecTransformer (55) - +- ^ RegularHashAggregateExecTransformer (54) - +- ^ InputIteratorTransformer (53) - +- ^ InputAdapter (52) - +- ^ ShuffleQueryStage (51), Statistics(X) - +- ReusedExchange (50) + : +- ^ InputIteratorTransformer (38) + : +- ShuffleQueryStage (36), Statistics(X) + : +- ColumnarExchange (35) + : +- VeloxAppendBatches (34) + : +- ^ ProjectExecTransformer (32) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (31) + : :- ^ InputIteratorTransformer (18) + : : +- ShuffleQueryStage (16), Statistics(X) + : : +- ColumnarExchange (15) + : : +- VeloxAppendBatches (14) + : : +- ^ ProjectExecTransformer (12) + : : +- ^ FilterExecTransformer (11) + : : +- ^ Scan parquet (10) + : +- ^ ProjectExecTransformer (30) + : +- ^ FilterExecTransformer (29) + : +- ^ RegularHashAggregateExecTransformer (28) + : +- ^ InputIteratorTransformer (27) + : +- ShuffleQueryStage (25), Statistics(X) + : +- ColumnarExchange (24) + : +- VeloxAppendBatches (23) + : +- ^ ProjectExecTransformer (21) + : +- ^ FlushableHashAggregateExecTransformer (20) + : +- ^ Scan parquet (19) + +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (63) + :- ^ InputIteratorTransformer (55) + : +- ShuffleQueryStage (53), Statistics(X) + : +- ColumnarExchange (52) + : +- VeloxAppendBatches (51) + : +- ^ ProjectExecTransformer (49) + : +- ^ FilterExecTransformer (48) + : +- ^ Scan parquet (47) + +- ^ ProjectExecTransformer (62) + +- ^ FilterExecTransformer (61) + +- ^ RegularHashAggregateExecTransformer (60) + +- ^ InputIteratorTransformer (59) + +- ShuffleQueryStage (57), Statistics(X) + +- ReusedExchange (56) +- == Initial Plan == - TakeOrderedAndProject (96) - +- HashAggregate (95) - +- HashAggregate (94) - +- Project (93) - +- ShuffledHashJoin Inner BuildRight (92) - :- Exchange (81) - : +- Project (80) - : +- ShuffledHashJoin Inner BuildLeft (79) - : :- Exchange (67) - : : +- Filter (66) - : : +- Scan parquet (65) - : +- Exchange (78) - : +- ShuffledHashJoin LeftSemi BuildRight (77) - : :- Exchange (70) - : : +- Filter (69) - : : +- Scan parquet (68) - : +- Project (76) - : +- Filter (75) - : +- HashAggregate (74) - : +- Exchange (73) - : +- HashAggregate (72) - : +- Scan parquet (71) - +- ShuffledHashJoin LeftSemi BuildRight (91) - :- Exchange (84) - : +- Filter (83) - : +- Scan parquet (82) - +- Project (90) - +- Filter (89) - +- HashAggregate (88) - +- Exchange (87) - +- HashAggregate (86) - +- Scan parquet (85) + TakeOrderedAndProject (109) + +- HashAggregate (108) + +- HashAggregate (107) + +- Project (106) + +- SortMergeJoin Inner (105) + :- Sort (92) + : 
+- Exchange (91) + : +- Project (90) + : +- SortMergeJoin Inner (89) + : :- Sort (74) + : : +- Exchange (73) + : : +- Filter (72) + : : +- Scan parquet (71) + : +- Sort (88) + : +- Exchange (87) + : +- SortMergeJoin LeftSemi (86) + : :- Sort (78) + : : +- Exchange (77) + : : +- Filter (76) + : : +- Scan parquet (75) + : +- Sort (85) + : +- Project (84) + : +- Filter (83) + : +- HashAggregate (82) + : +- Exchange (81) + : +- HashAggregate (80) + : +- Scan parquet (79) + +- SortMergeJoin LeftSemi (104) + :- Sort (96) + : +- Exchange (95) + : +- Filter (94) + : +- Scan parquet (93) + +- Sort (103) + +- Project (102) + +- Filter (101) + +- HashAggregate (100) + +- Exchange (99) + +- HashAggregate (98) + +- Scan parquet (97) (1) Scan parquet @@ -112,420 +118,472 @@ Input [2]: [c_custkey#X, c_name#X] Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [c_custkey#X, c_name#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [c_custkey#X, c_name#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [c_custkey#X, c_name#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(18) FlushableHashAggregateExecTransformer +(20) FlushableHashAggregateExecTransformer Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(19) 
ProjectExecTransformer +(21) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] +Arguments: X + +(24) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(25) RegularHashAggregateExecTransformer +(28) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(26) FilterExecTransformer +(29) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(27) ProjectExecTransformer +(30) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(28) ShuffledHashJoinExecTransformer +(31) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(29) ProjectExecTransformer +(32) ProjectExecTransformer Output [5]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(30) WholeStageCodegenTransformer (X) +(33) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(31) ColumnarExchange +(34) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(35) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(32) ShuffleQueryStage +(36) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(33) InputAdapter +(37) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(34) InputIteratorTransformer +(38) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(35) ShuffledHashJoinExecTransformer +(39) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(36) ProjectExecTransformer +(40) ProjectExecTransformer Output [6]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(37) WholeStageCodegenTransformer (X) +(41) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, c_custkey#X, 
c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(38) ColumnarExchange +(42) VeloxAppendBatches +Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(43) ColumnarExchange Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(39) ShuffleQueryStage +(44) ShuffleQueryStage Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(40) InputAdapter +(45) InputAdapter Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -(41) InputIteratorTransformer +(46) InputIteratorTransformer Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -(42) Scan parquet +(47) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(43) FilterExecTransformer +(48) FilterExecTransformer Input [2]: [l_orderkey#X, l_quantity#X] Arguments: isnotnull(l_orderkey#X) -(44) ProjectExecTransformer +(49) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_quantity#X] Input [2]: [l_orderkey#X, l_quantity#X] -(45) WholeStageCodegenTransformer (X) +(50) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] Arguments: false -(46) ColumnarExchange +(51) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] +Arguments: X + +(52) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [id=#X] -(47) ShuffleQueryStage +(53) ShuffleQueryStage Output [2]: [l_orderkey#X, l_quantity#X] Arguments: X -(48) InputAdapter +(54) InputAdapter Input [2]: [l_orderkey#X, l_quantity#X] -(49) InputIteratorTransformer +(55) InputIteratorTransformer Input [2]: [l_orderkey#X, l_quantity#X] -(50) ReusedExchange [Reuses operator id: 21] +(56) ReusedExchange [Reuses operator id: 24] Output [3]: [l_orderkey#X, sum#X, isEmpty#X] -(51) ShuffleQueryStage +(57) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(52) InputAdapter +(58) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(53) InputIteratorTransformer +(59) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(54) RegularHashAggregateExecTransformer +(60) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(55) FilterExecTransformer +(61) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(56) ProjectExecTransformer +(62) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(57) ShuffledHashJoinExecTransformer +(63) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(58) ShuffledHashJoinExecTransformer +(64) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join 
condition: None -(59) ProjectExecTransformer +(65) ProjectExecTransformer Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(60) RegularHashAggregateExecTransformer +(66) RegularHashAggregateExecTransformer Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(61) RegularHashAggregateExecTransformer +(67) RegularHashAggregateExecTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(62) WholeStageCodegenTransformer (X) +(68) WholeStageCodegenTransformer (X) Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: false -(63) TakeOrderedAndProjectExecTransformer +(69) TakeOrderedAndProjectExecTransformer Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X], 0 -(64) VeloxColumnarToRowExec +(70) VeloxColumnarToRowExec Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(65) Scan parquet +(71) Scan parquet Output [2]: [c_custkey#X, c_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey)] ReadSchema: struct -(66) Filter +(72) Filter Input [2]: [c_custkey#X, c_name#X] Condition : isnotnull(c_custkey#X) -(67) Exchange +(73) Exchange Input [2]: [c_custkey#X, c_name#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) Scan parquet +(74) Sort +Input [2]: [c_custkey#X, c_name#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(75) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(69) Filter +(76) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Condition : (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(70) Exchange +(77) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(78) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(79) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(72) HashAggregate +(80) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(73) Exchange +(81) Exchange Input [3]: 
[l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) HashAggregate +(82) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(75) Filter +(83) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(76) Project +(84) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(77) ShuffledHashJoin +(85) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(86) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(78) Exchange +(87) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(79) ShuffledHashJoin +(88) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(89) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(80) Project +(90) Project Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(81) Exchange +(91) Exchange Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(82) Scan parquet +(92) Sort +Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(93) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(83) Filter +(94) Filter Input [2]: [l_orderkey#X, l_quantity#X] Condition : isnotnull(l_orderkey#X) -(84) Exchange +(95) Exchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) Scan parquet +(96) Sort +Input [2]: [l_orderkey#X, l_quantity#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(97) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(86) HashAggregate +(98) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(87) Exchange +(99) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(88) HashAggregate +(100) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(89) Filter +(101) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(90) Project +(102) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(91) ShuffledHashJoin +(103) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(104) 
SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(92) ShuffledHashJoin +(105) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(93) Project +(106) Project Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(94) HashAggregate +(107) HashAggregate Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(95) HashAggregate +(108) HashAggregate Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(96) TakeOrderedAndProject +(109) TakeOrderedAndProject Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(97) AdaptiveSparkPlan +(110) AdaptiveSparkPlan Output [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/19.txt index 69b8adaee61da..58e80362020ff 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/19.txt @@ -1,37 +1,39 @@ == Physical Plan == -AdaptiveSparkPlan (34) +AdaptiveSparkPlan (38) +- == Final Plan == - VeloxColumnarToRowExec (22) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (24) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - 
HashAggregate (33) - +- HashAggregate (32) - +- Project (31) - +- ShuffledHashJoin Inner BuildRight (30) - :- Exchange (26) - : +- Project (25) - : +- Filter (24) - : +- Scan parquet (23) - +- Exchange (29) - +- Filter (28) - +- Scan parquet (27) + HashAggregate (37) + +- HashAggregate (36) + +- Project (35) + +- SortMergeJoin Inner (34) + :- Sort (29) + : +- Exchange (28) + : +- Project (27) + : +- Filter (26) + : +- Scan parquet (25) + +- Sort (33) + +- Exchange (32) + +- Filter (31) + +- Scan parquet (30) (1) Scan parquet @@ -53,140 +55,156 @@ Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipin Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_size#X, 
p_container#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_extendedprice#X, l_discount#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(20) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(22) VeloxColumnarToRowExec +(24) VeloxColumnarToRowExec Input [1]: [revenue#X] -(23) Scan parquet +(25) Scan parquet Output [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipinstruct), In(l_shipmode, [AIR,AIR REG]), EqualTo(l_shipinstruct,DELIVER IN PERSON), IsNotNull(l_partkey), Or(Or(And(GreaterThanOrEqual(l_quantity,1.00),LessThanOrEqual(l_quantity,11.00)),And(GreaterThanOrEqual(l_quantity,10.00),LessThanOrEqual(l_quantity,20.00))),And(GreaterThanOrEqual(l_quantity,20.00),LessThanOrEqual(l_quantity,30.00)))] ReadSchema: struct -(24) Filter +(26) Filter Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Condition : ((((isnotnull(l_shipinstruct#X) AND l_shipmode#X IN (AIR,AIR REG)) AND (l_shipinstruct#X = DELIVER IN PERSON)) AND isnotnull(l_partkey#X)) AND ((((l_quantity#X 
>= 1.00) AND (l_quantity#X <= 11.00)) OR ((l_quantity#X >= 10.00) AND (l_quantity#X <= 20.00))) OR ((l_quantity#X >= 20.00) AND (l_quantity#X <= 30.00)))) -(25) Project +(27) Project Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] -(26) Exchange +(28) Exchange Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Scan parquet +(29) Sort +Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(30) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(28) Filter +(31) Filter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Condition : (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(29) Exchange +(32) Exchange Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(30) ShuffledHashJoin +(33) Sort +Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(34) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(31) Project +(35) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(32) HashAggregate +(36) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(33) HashAggregate +(37) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), 
DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(34) AdaptiveSparkPlan +(38) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/20.txt index 4fa43644b5686..c22b822e6f7df 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/20.txt @@ -1,119 +1,129 @@ == Physical Plan == -AdaptiveSparkPlan (123) +AdaptiveSparkPlan (143) +- == Final Plan == - VeloxColumnarToRowExec (83) - +- AQEShuffleRead (82) - +- ShuffleQueryStage (81), Statistics(X) - +- ColumnarExchange (80) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (62) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) - : : +- ^ ProjectExecTransformer (3) - : : +- ^ FilterExecTransformer (2) - : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ ShuffledHashJoinExecTransformer Inner (55) - : :- ^ InputIteratorTransformer (31) - : : +- ^ InputAdapter (30) - : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : +- ColumnarExchange (28) - : : +- ^ ProjectExecTransformer (26) - : : +- ^ ShuffledHashJoinExecTransformer LeftSemi (25) - : : :- ^ InputIteratorTransformer (16) - : : : +- ^ InputAdapter (15) - : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : +- ColumnarExchange (13) - : : : +- ^ ProjectExecTransformer (11) - : : : +- ^ FilterExecTransformer (10) - : : : +- ^ Scan parquet (9) - : : +- ^ InputIteratorTransformer (24) - : : +- ^ InputAdapter (23) - : : +- ^ ShuffleQueryStage (22), Statistics(X) - : : +- ColumnarExchange (21) - : : +- ^ ProjectExecTransformer (19) - : : +- ^ FilterExecTransformer (18) - : : +- ^ Scan parquet (17) - : +- ^ InputIteratorTransformer (54) - : +- ^ InputAdapter (53) - : +- ^ ShuffleQueryStage (52), Statistics(X) - : +- ColumnarExchange (51) - : +- ^ ProjectExecTransformer (49) - : +- ^ FilterExecTransformer (48) - : +- ^ ProjectExecTransformer (47) - : +- ^ RegularHashAggregateExecTransformer (46) - : +- ^ RegularHashAggregateExecTransformer (45) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (44) - : :- ^ InputIteratorTransformer (39) - : : +- ^ InputAdapter (38) - : : +- ^ ShuffleQueryStage (37), Statistics(X) - : : +- ColumnarExchange (36) - : : +- ^ ProjectExecTransformer (34) - : : +- ^ FilterExecTransformer (33) - : : +- ^ Scan parquet (32) - : +- ^ InputIteratorTransformer (43) - : +- ^ InputAdapter (42) - : +- 
^ ShuffleQueryStage (41), Statistics(X) - : +- ReusedExchange (40) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74), Statistics(X) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ FilterExecTransformer (70) - +- ^ Scan parquet (69) + VeloxColumnarToRowExec (93) + +- AQEShuffleRead (92) + +- ShuffleQueryStage (91), Statistics(X) + +- ColumnarExchange (90) + +- VeloxAppendBatches (89) + +- ^ ProjectExecTransformer (87) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (86) + :- ^ InputIteratorTransformer (76) + : +- ShuffleQueryStage (74), Statistics(X) + : +- ColumnarExchange (73) + : +- VeloxAppendBatches (72) + : +- ^ ProjectExecTransformer (70) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (69) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) + : : +- ^ ProjectExecTransformer (3) + : : +- ^ FilterExecTransformer (2) + : : +- ^ Scan parquet (1) + : +- ^ InputIteratorTransformer (68) + : +- ShuffleQueryStage (66), Statistics(X) + : +- ColumnarExchange (65) + : +- VeloxAppendBatches (64) + : +- ^ ProjectExecTransformer (62) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (61) + : :- ^ InputIteratorTransformer (35) + : : +- ShuffleQueryStage (33), Statistics(X) + : : +- ColumnarExchange (32) + : : +- VeloxAppendBatches (31) + : : +- ^ ProjectExecTransformer (29) + : : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (28) + : : :- ^ InputIteratorTransformer (18) + : : : +- ShuffleQueryStage (16), Statistics(X) + : : : +- ColumnarExchange (15) + : : : +- VeloxAppendBatches (14) + : : : +- ^ ProjectExecTransformer (12) + : : : +- ^ FilterExecTransformer (11) + : : : +- ^ Scan parquet (10) + : : +- ^ InputIteratorTransformer (27) + : : +- ShuffleQueryStage (25), Statistics(X) + : : +- ColumnarExchange (24) + : : +- VeloxAppendBatches (23) + : : +- ^ ProjectExecTransformer (21) + : : +- ^ FilterExecTransformer (20) + : : +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (60) + : +- ShuffleQueryStage (58), Statistics(X) + : +- ColumnarExchange (57) + : +- VeloxAppendBatches (56) + : +- ^ ProjectExecTransformer (54) + : +- ^ FilterExecTransformer (53) + : +- ^ ProjectExecTransformer (52) + : +- ^ RegularHashAggregateExecTransformer (51) + : +- ^ RegularHashAggregateExecTransformer (50) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (49) + : :- ^ InputIteratorTransformer (44) + : : +- ShuffleQueryStage (42), Statistics(X) + : : +- ColumnarExchange (41) + : : +- VeloxAppendBatches (40) + : : +- ^ ProjectExecTransformer (38) + : : +- ^ FilterExecTransformer (37) + : : +- ^ Scan parquet (36) + : +- ^ InputIteratorTransformer (48) + : +- ShuffleQueryStage (46), Statistics(X) + : +- ReusedExchange (45) + +- ^ InputIteratorTransformer (85) + +- ShuffleQueryStage (83), Statistics(X) + +- ColumnarExchange (82) + +- VeloxAppendBatches (81) + +- ^ ProjectExecTransformer (79) + +- ^ FilterExecTransformer (78) + +- ^ Scan parquet (77) +- == Initial Plan == - Sort (122) - +- Exchange (121) - +- Project (120) - +- ShuffledHashJoin Inner BuildRight (119) - :- Exchange (114) - : +- Project (113) - : +- ShuffledHashJoin LeftSemi BuildRight (112) - : :- Exchange (86) - : : +- Filter (85) - : : +- Scan parquet (84) - : +- Exchange (111) - : +- Project (110) - : +- ShuffledHashJoin Inner BuildLeft (109) - : :- Exchange (95) - : : +- ShuffledHashJoin LeftSemi BuildRight (94) - : : :- Exchange (89) - : : : 
+- Filter (88) - : : : +- Scan parquet (87) - : : +- Exchange (93) - : : +- Project (92) - : : +- Filter (91) - : : +- Scan parquet (90) - : +- Exchange (108) - : +- Filter (107) - : +- HashAggregate (106) - : +- HashAggregate (105) - : +- ShuffledHashJoin LeftSemi BuildRight (104) - : :- Exchange (99) - : : +- Project (98) - : : +- Filter (97) - : : +- Scan parquet (96) - : +- Exchange (103) - : +- Project (102) - : +- Filter (101) - : +- Scan parquet (100) - +- Exchange (118) - +- Project (117) - +- Filter (116) - +- Scan parquet (115) + Sort (142) + +- Exchange (141) + +- Project (140) + +- SortMergeJoin Inner (139) + :- Sort (133) + : +- Exchange (132) + : +- Project (131) + : +- SortMergeJoin LeftSemi (130) + : :- Sort (97) + : : +- Exchange (96) + : : +- Filter (95) + : : +- Scan parquet (94) + : +- Sort (129) + : +- Exchange (128) + : +- Project (127) + : +- SortMergeJoin Inner (126) + : :- Sort (109) + : : +- Exchange (108) + : : +- SortMergeJoin LeftSemi (107) + : : :- Sort (101) + : : : +- Exchange (100) + : : : +- Filter (99) + : : : +- Scan parquet (98) + : : +- Sort (106) + : : +- Exchange (105) + : : +- Project (104) + : : +- Filter (103) + : : +- Scan parquet (102) + : +- Sort (125) + : +- Exchange (124) + : +- Filter (123) + : +- HashAggregate (122) + : +- HashAggregate (121) + : +- SortMergeJoin LeftSemi (120) + : :- Sort (114) + : : +- Exchange (113) + : : +- Project (112) + : : +- Filter (111) + : : +- Scan parquet (110) + : +- Sort (119) + : +- Exchange (118) + : +- Project (117) + : +- Filter (116) + : +- Scan parquet (115) + +- Sort (138) + +- Exchange (137) + +- Project (136) + +- Filter (135) + +- Scan parquet (134) (1) Scan parquet @@ -135,508 +145,588 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, 
ps_suppkey#X, ps_availqty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(18) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [p_partkey#X, p_name#X] Arguments: (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [2]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(24) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [1]: [p_partkey#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [1]: [p_partkey#X] -(25) ShuffledHashJoinExecTransformer +(28) ShuffledHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, ps_suppkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(32) Scan parquet +(36) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(33) FilterExecTransformer +(37) FilterExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Arguments: 
((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(34) ProjectExecTransformer +(38) ProjectExecTransformer Output [4]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] +Arguments: X + +(41) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] -(40) ReusedExchange [Reuses operator id: 21] +(45) ReusedExchange [Reuses operator id: 24] Output [1]: [p_partkey#X] -(41) ShuffleQueryStage +(46) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(42) InputAdapter +(47) InputAdapter Input [1]: [p_partkey#X] -(43) InputIteratorTransformer +(48) InputIteratorTransformer Input [1]: [p_partkey#X] -(44) ShuffledHashJoinExecTransformer +(49) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(45) RegularHashAggregateExecTransformer +(50) RegularHashAggregateExecTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(46) RegularHashAggregateExecTransformer +(51) RegularHashAggregateExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(47) ProjectExecTransformer +(52) ProjectExecTransformer Output [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3)) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(48) FilterExecTransformer +(53) FilterExecTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: isnotnull((0.5 * sum(l_quantity))#X) -(49) ProjectExecTransformer +(54) ProjectExecTransformer Output [4]: [hash(l_partkey#X, l_suppkey#X, 42) AS hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(50) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: false -(51) ColumnarExchange +(56) VeloxAppendBatches +Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] +Arguments: X + +(57) ColumnarExchange Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: hashpartitioning(l_partkey#X, 
l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(52) ShuffleQueryStage +(58) ShuffleQueryStage Output [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: X -(53) InputAdapter +(59) InputAdapter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(54) InputIteratorTransformer +(60) InputIteratorTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(55) ShuffledHashJoinExecTransformer +(61) ShuffledHashJoinExecTransformer Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(56) ProjectExecTransformer +(62) ProjectExecTransformer Output [2]: [hash(ps_suppkey#X, 42) AS hash_partition_key#X, ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(57) WholeStageCodegenTransformer (X) +(63) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, ps_suppkey#X] Arguments: false -(58) ColumnarExchange +(64) VeloxAppendBatches +Input [2]: [hash_partition_key#X, ps_suppkey#X] +Arguments: X + +(65) ColumnarExchange Input [2]: [hash_partition_key#X, ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(66) ShuffleQueryStage Output [1]: [ps_suppkey#X] Arguments: X -(60) InputAdapter +(67) InputAdapter Input [1]: [ps_suppkey#X] -(61) InputIteratorTransformer +(68) InputIteratorTransformer Input [1]: [ps_suppkey#X] -(62) ShuffledHashJoinExecTransformer +(69) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(63) ProjectExecTransformer +(70) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(64) WholeStageCodegenTransformer (X) +(71) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(65) ColumnarExchange +(72) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(73) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(74) ShuffleQueryStage Output [3]: [s_name#X, s_address#X, s_nationkey#X] Arguments: X -(67) InputAdapter +(75) InputAdapter Input [3]: [s_name#X, s_address#X, s_nationkey#X] -(68) InputIteratorTransformer +(76) InputIteratorTransformer Input [3]: [s_name#X, s_address#X, s_nationkey#X] -(69) Scan parquet +(77) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(70) FilterExecTransformer +(78) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(71) ProjectExecTransformer +(79) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(72) WholeStageCodegenTransformer (X) +(80) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] 
Arguments: false -(73) ColumnarExchange +(81) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(82) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(83) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(75) InputAdapter +(84) InputAdapter Input [1]: [n_nationkey#X] -(76) InputIteratorTransformer +(85) InputIteratorTransformer Input [1]: [n_nationkey#X] -(77) ShuffledHashJoinExecTransformer +(86) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(78) ProjectExecTransformer +(87) ProjectExecTransformer Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(79) WholeStageCodegenTransformer (X) +(88) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(80) ColumnarExchange +(89) VeloxAppendBatches +Input [2]: [s_name#X, s_address#X] +Arguments: X + +(90) ColumnarExchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(81) ShuffleQueryStage +(91) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] Arguments: X -(82) AQEShuffleRead +(92) AQEShuffleRead Input [2]: [s_name#X, s_address#X] Arguments: local -(83) VeloxColumnarToRowExec +(93) VeloxColumnarToRowExec Input [2]: [s_name#X, s_address#X] -(84) Scan parquet +(94) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_nationkey)] ReadSchema: struct -(85) Filter +(95) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Condition : isnotnull(s_nationkey#X) -(86) Exchange +(96) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(87) Scan parquet +(97) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(98) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(88) Filter +(99) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Condition : ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(89) Exchange +(100) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) Scan parquet +(101) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: [ps_partkey#X ASC NULLS FIRST], false, 0 + +(102) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(91) Filter +(103) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(92) Project +(104) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(93) Exchange +(105) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(94) ShuffledHashJoin +(106) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], 
false, 0 + +(107) SortMergeJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(95) Exchange +(108) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(96) Scan parquet +(109) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: [ps_partkey#X ASC NULLS FIRST, ps_suppkey#X ASC NULLS FIRST], false, 0 + +(110) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(97) Filter +(111) Filter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(98) Project +(112) Project Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(99) Exchange +(113) Exchange Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(100) Scan parquet +(114) Sort +Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(115) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(101) Filter +(116) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(102) Project +(117) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(103) Exchange +(118) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(104) ShuffledHashJoin +(119) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(120) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join condition: None -(105) HashAggregate +(121) HashAggregate Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(106) HashAggregate +(122) HashAggregate Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [CheckOverflow((0.50 * promote_precision(sum(l_quantity#X)#X)), DecimalType(24,3)) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(107) Filter +(123) Filter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Condition : isnotnull((0.5 * sum(l_quantity))#X) -(108) Exchange +(124) Exchange Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(125) Sort +Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] +Arguments: [l_partkey#X ASC NULLS FIRST, l_suppkey#X ASC NULLS FIRST], false, 0 + +(126) SortMergeJoin Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join condition: (cast(ps_availqty#X as 
decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(110) Project +(127) Project Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(111) Exchange +(128) Exchange Input [1]: [ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) ShuffledHashJoin +(129) Sort +Input [1]: [ps_suppkey#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(130) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join condition: None -(113) Project +(131) Project Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(114) Exchange +(132) Exchange Input [3]: [s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(115) Scan parquet +(133) Sort +Input [3]: [s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(134) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(116) Filter +(135) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(117) Project +(136) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(118) Exchange +(137) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(119) ShuffledHashJoin +(138) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(139) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(120) Project +(140) Project Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(121) Exchange +(141) Exchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(122) Sort +(142) Sort Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(123) AdaptiveSparkPlan +(143) AdaptiveSparkPlan Output [2]: [s_name#X, s_address#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/21.txt index f17e6fa164a81..8413e2f8f2325 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/21.txt @@ -1,113 +1,122 @@ == Physical Plan == -AdaptiveSparkPlan (118) +AdaptiveSparkPlan (137) +- == Final Plan == - VeloxColumnarToRowExec (81) - +- ^ RegularHashAggregateExecTransformer (79) - +- ^ InputIteratorTransformer (78) - +- ^ InputAdapter (77) - +- ^ ShuffleQueryStage (76), Statistics(X) - +- ColumnarExchange (75) - +- ^ ProjectExecTransformer (73) - +- ^ FlushableHashAggregateExecTransformer (72) - +- ^ ProjectExecTransformer (71) - +- ^ ShuffledHashJoinExecTransformer Inner (70) - :- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ ShuffledHashJoinExecTransformer Inner (55) - : :- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), 
Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ ShuffledHashJoinExecTransformer Inner (40) - : : :- ^ InputIteratorTransformer (8) - : : : +- ^ InputAdapter (7) - : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (91) + +- ^ RegularHashAggregateExecTransformer (89) + +- ^ InputIteratorTransformer (88) + +- ShuffleQueryStage (86), Statistics(X) + +- ColumnarExchange (85) + +- VeloxAppendBatches (84) + +- ^ ProjectExecTransformer (82) + +- ^ FlushableHashAggregateExecTransformer (81) + +- ^ ProjectExecTransformer (80) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (79) + :- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (62) + : :- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (45) + : : :- ^ InputIteratorTransformer (9) + : : : +- ShuffleQueryStage (7), Statistics(X) + : : : +- ColumnarExchange (6) + : : : +- VeloxAppendBatches (5) : : : +- ^ ProjectExecTransformer (3) : : : +- ^ FilterExecTransformer (2) : : : +- ^ Scan parquet (1) - : : +- ^ InputIteratorTransformer (39) - : : +- ^ InputAdapter (38) - : : +- ^ ShuffleQueryStage (37), Statistics(X) - : : +- ColumnarExchange (36) - : : +- ^ ProjectExecTransformer (34) - : : +- ^ ShuffledHashJoinExecTransformer LeftAnti (33) - : : :- ^ ShuffledHashJoinExecTransformer LeftSemi (24) - : : : :- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ FilterExecTransformer (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (23) - : : : +- ^ InputAdapter (22) - : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : +- ColumnarExchange (20) - : : : +- ^ ProjectExecTransformer (18) - : : : +- ^ Scan parquet (17) - : : +- ^ InputIteratorTransformer (32) - : : +- ^ InputAdapter (31) - : : +- ^ ShuffleQueryStage (30), Statistics(X) - : : +- ColumnarExchange (29) - : : +- ^ ProjectExecTransformer (27) - : : +- ^ FilterExecTransformer (26) - : : +- ^ Scan parquet (25) - : +- ^ InputIteratorTransformer (54) - : +- ^ InputAdapter (53) - : +- ^ ShuffleQueryStage (52), Statistics(X) - : +- ColumnarExchange (51) - : +- ^ ProjectExecTransformer (49) - : +- ^ FilterExecTransformer (48) - : +- ^ Scan parquet (47) - +- ^ InputIteratorTransformer (69) - +- ^ InputAdapter (68) - +- ^ ShuffleQueryStage (67), Statistics(X) - +- ColumnarExchange (66) - +- ^ ProjectExecTransformer (64) - +- ^ FilterExecTransformer (63) - +- ^ Scan parquet (62) + : : +- ^ InputIteratorTransformer (44) + : : +- ShuffleQueryStage (42), Statistics(X) + : : +- ColumnarExchange (41) + : : +- VeloxAppendBatches (40) + : : +- ^ ProjectExecTransformer (38) + : : +- ^ ShuffledHashJoinExecTransformer LeftAnti BuildRight (37) + : : :- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (27) + : : : :- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ FilterExecTransformer (11) 
+ : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (26) + : : : +- ShuffleQueryStage (24), Statistics(X) + : : : +- ColumnarExchange (23) + : : : +- VeloxAppendBatches (22) + : : : +- ^ ProjectExecTransformer (20) + : : : +- ^ Scan parquet (19) + : : +- ^ InputIteratorTransformer (36) + : : +- ShuffleQueryStage (34), Statistics(X) + : : +- ColumnarExchange (33) + : : +- VeloxAppendBatches (32) + : : +- ^ ProjectExecTransformer (30) + : : +- ^ FilterExecTransformer (29) + : : +- ^ Scan parquet (28) + : +- ^ InputIteratorTransformer (61) + : +- ShuffleQueryStage (59), Statistics(X) + : +- ColumnarExchange (58) + : +- VeloxAppendBatches (57) + : +- ^ ProjectExecTransformer (55) + : +- ^ FilterExecTransformer (54) + : +- ^ Scan parquet (53) + +- ^ InputIteratorTransformer (78) + +- ShuffleQueryStage (76), Statistics(X) + +- ColumnarExchange (75) + +- VeloxAppendBatches (74) + +- ^ ProjectExecTransformer (72) + +- ^ FilterExecTransformer (71) + +- ^ Scan parquet (70) +- == Initial Plan == - TakeOrderedAndProject (117) - +- HashAggregate (116) - +- Exchange (115) - +- HashAggregate (114) - +- Project (113) - +- ShuffledHashJoin Inner BuildRight (112) - :- Exchange (107) - : +- Project (106) - : +- ShuffledHashJoin Inner BuildRight (105) - : :- Exchange (100) - : : +- Project (99) - : : +- ShuffledHashJoin Inner BuildLeft (98) - : : :- Exchange (84) - : : : +- Filter (83) - : : : +- Scan parquet (82) - : : +- Exchange (97) - : : +- ShuffledHashJoin LeftAnti BuildRight (96) - : : :- ShuffledHashJoin LeftSemi BuildRight (91) - : : : :- Exchange (88) - : : : : +- Project (87) - : : : : +- Filter (86) - : : : : +- Scan parquet (85) - : : : +- Exchange (90) - : : : +- Scan parquet (89) - : : +- Exchange (95) - : : +- Project (94) - : : +- Filter (93) - : : +- Scan parquet (92) - : +- Exchange (104) - : +- Project (103) - : +- Filter (102) - : +- Scan parquet (101) - +- Exchange (111) - +- Project (110) - +- Filter (109) - +- Scan parquet (108) + TakeOrderedAndProject (136) + +- HashAggregate (135) + +- Exchange (134) + +- HashAggregate (133) + +- Project (132) + +- SortMergeJoin Inner (131) + :- Sort (125) + : +- Exchange (124) + : +- Project (123) + : +- SortMergeJoin Inner (122) + : :- Sort (116) + : : +- Exchange (115) + : : +- Project (114) + : : +- SortMergeJoin Inner (113) + : : :- Sort (95) + : : : +- Exchange (94) + : : : +- Filter (93) + : : : +- Scan parquet (92) + : : +- Sort (112) + : : +- Exchange (111) + : : +- SortMergeJoin LeftAnti (110) + : : :- SortMergeJoin LeftSemi (104) + : : : :- Sort (100) + : : : : +- Exchange (99) + : : : : +- Project (98) + : : : : +- Filter (97) + : : : : +- Scan parquet (96) + : : : +- Sort (103) + : : : +- Exchange (102) + : : : +- Scan parquet (101) + : : +- Sort (109) + : : +- Exchange (108) + : : +- Project (107) + : : +- Filter (106) + : : +- Scan parquet (105) + : +- Sort (121) + : +- Exchange (120) + : +- Project (119) + : +- Filter (118) + : +- Scan parquet (117) + +- Sort (130) + +- Exchange (129) + +- Project (128) + +- Filter (127) + +- Scan parquet (126) (1) Scan parquet @@ -129,490 +138,566 @@ Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, 
[s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Arguments: ((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [2]: [l_orderkey#X, l_suppkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(23) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(24) ShuffledHashJoinExecTransformer +(27) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(25) Scan parquet +(28) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(26) FilterExecTransformer +(29) FilterExecTransformer Input [4]: [l_orderkey#X, l_suppkey#X, 
l_commitdate#X, l_receiptdate#X] Arguments: ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(27) ProjectExecTransformer +(30) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(28) WholeStageCodegenTransformer (X) +(31) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(29) ColumnarExchange +(32) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(33) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(30) ShuffleQueryStage +(34) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(31) InputAdapter +(35) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(32) InputIteratorTransformer +(36) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(33) ShuffledHashJoinExecTransformer +(37) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(34) ProjectExecTransformer +(38) ProjectExecTransformer Output [3]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [2]: [l_orderkey#X, l_suppkey#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(41) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(40) ShuffledHashJoinExecTransformer +(45) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] -(47) Scan parquet +(53) Scan parquet Output [2]: 
[o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(48) FilterExecTransformer +(54) FilterExecTransformer Input [2]: [o_orderkey#X, o_orderstatus#X] Arguments: ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(49) ProjectExecTransformer +(55) ProjectExecTransformer Output [2]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(50) WholeStageCodegenTransformer (X) +(56) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, o_orderkey#X] Arguments: false -(51) ColumnarExchange +(57) VeloxAppendBatches +Input [2]: [hash_partition_key#X, o_orderkey#X] +Arguments: X + +(58) ColumnarExchange Input [2]: [hash_partition_key#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [id=#X] -(52) ShuffleQueryStage +(59) ShuffleQueryStage Output [1]: [o_orderkey#X] Arguments: X -(53) InputAdapter +(60) InputAdapter Input [1]: [o_orderkey#X] -(54) InputIteratorTransformer +(61) InputIteratorTransformer Input [1]: [o_orderkey#X] -(55) ShuffledHashJoinExecTransformer +(62) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [s_name#X, s_nationkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [s_name#X, s_nationkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [s_name#X, s_nationkey#X] -(62) Scan parquet +(70) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(63) FilterExecTransformer +(71) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(64) ProjectExecTransformer +(72) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(65) WholeStageCodegenTransformer (X) +(73) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(66) ColumnarExchange +(74) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(75) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(67) ShuffleQueryStage +(76) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(68) InputAdapter +(77) InputAdapter Input [1]: [n_nationkey#X] -(69) 
InputIteratorTransformer +(78) InputIteratorTransformer Input [1]: [n_nationkey#X] -(70) ShuffledHashJoinExecTransformer +(79) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(72) FlushableHashAggregateExecTransformer +(81) FlushableHashAggregateExecTransformer Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(73) ProjectExecTransformer +(82) ProjectExecTransformer Output [3]: [hash(s_name#X, 42) AS hash_partition_key#X, s_name#X, count#X] Input [2]: [s_name#X, count#X] -(74) WholeStageCodegenTransformer (X) +(83) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: false -(75) ColumnarExchange +(84) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, count#X] +Arguments: X + +(85) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] -(76) ShuffleQueryStage +(86) ShuffleQueryStage Output [2]: [s_name#X, count#X] Arguments: X -(77) InputAdapter +(87) InputAdapter Input [2]: [s_name#X, count#X] -(78) InputIteratorTransformer +(88) InputIteratorTransformer Input [2]: [s_name#X, count#X] -(79) RegularHashAggregateExecTransformer +(89) RegularHashAggregateExecTransformer Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(80) WholeStageCodegenTransformer (X) +(90) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, numwait#X] Arguments: false -(81) VeloxColumnarToRowExec +(91) VeloxColumnarToRowExec Input [2]: [s_name#X, numwait#X] -(82) Scan parquet +(92) Scan parquet Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(83) Filter +(93) Filter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(84) Exchange +(94) Exchange Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) Scan parquet +(95) Sort +Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(96) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(86) Filter +(97) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(87) Project +(98) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(88) Exchange +(99) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(89) Scan parquet +(100) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + 
+(101) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(90) Exchange +(102) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(91) ShuffledHashJoin +(103) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(104) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(92) Scan parquet +(105) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(93) Filter +(106) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(94) Project +(107) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(95) Exchange +(108) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(96) ShuffledHashJoin +(109) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(110) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: NOT (l_suppkey#X = l_suppkey#X) -(97) Exchange +(111) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(98) ShuffledHashJoin +(112) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(113) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(99) Project +(114) Project Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(100) Exchange +(115) Exchange Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(101) Scan parquet +(116) Sort +Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(117) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(102) Filter +(118) Filter Input [2]: [o_orderkey#X, o_orderstatus#X] Condition : ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(103) Project +(119) Project Output [1]: [o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(104) Exchange +(120) Exchange Input [1]: [o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) ShuffledHashJoin +(121) Sort +Input [1]: [o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(122) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(106) Project +(123) Project Output [2]: [s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(107) Exchange +(124) Exchange Input [2]: [s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) Scan parquet +(125) Sort +Input [2]: 
[s_name#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(126) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(109) Filter +(127) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(110) Project +(128) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(111) Exchange +(129) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) ShuffledHashJoin +(130) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(131) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(113) Project +(132) Project Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(114) HashAggregate +(133) HashAggregate Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(115) Exchange +(134) Exchange Input [2]: [s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(116) HashAggregate +(135) HashAggregate Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(117) TakeOrderedAndProject +(136) TakeOrderedAndProject Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X] -(118) AdaptiveSparkPlan +(137) AdaptiveSparkPlan Output [2]: [s_name#X, numwait#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/22.txt index f8ce5b6c7485a..214b34066a8f8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/22.txt @@ -1,47 +1,49 @@ == Physical Plan == -AdaptiveSparkPlan (46) +AdaptiveSparkPlan (52) +- == Final Plan == - VeloxColumnarToRowExec (33) - +- ^ SortExecTransformer (31) - +- ^ InputIteratorTransformer (30) - +- ^ InputAdapter (29) - +- ^ ShuffleQueryStage (28), Statistics(X) - +- ColumnarExchange (27) - +- ^ RegularHashAggregateExecTransformer (25) - +- ^ InputIteratorTransformer (24) - +- ^ InputAdapter (23) - +- ^ ShuffleQueryStage (22), Statistics(X) - +- ColumnarExchange (21) - +- ^ ProjectExecTransformer (19) - +- ^ FlushableHashAggregateExecTransformer (18) - +- ^ ProjectExecTransformer (17) - +- ^ ShuffledHashJoinExecTransformer LeftAnti (16) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (37) + +- ^ SortExecTransformer (35) + +- ^ InputIteratorTransformer (34) + +- ShuffleQueryStage (32), Statistics(X) + +- ColumnarExchange (31) + +- VeloxAppendBatches (30) + +- ^ RegularHashAggregateExecTransformer (28) + +- ^ InputIteratorTransformer (27) + +- ShuffleQueryStage (25), Statistics(X) + +- ColumnarExchange (24) + +- VeloxAppendBatches (23) + +- ^ ProjectExecTransformer (21) + +- ^ FlushableHashAggregateExecTransformer (20) + +- ^ ProjectExecTransformer (19) + +- ^ 
ShuffledHashJoinExecTransformer LeftAnti BuildRight (18) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (15) - +- ^ InputAdapter (14) - +- ^ ShuffleQueryStage (13), Statistics(X) - +- ColumnarExchange (12) - +- ^ ProjectExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (17) + +- ShuffleQueryStage (15), Statistics(X) + +- ColumnarExchange (14) + +- VeloxAppendBatches (13) + +- ^ ProjectExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (45) - +- Exchange (44) - +- HashAggregate (43) - +- Exchange (42) - +- HashAggregate (41) - +- Project (40) - +- ShuffledHashJoin LeftAnti BuildRight (39) - :- Exchange (36) - : +- Filter (35) - : +- Scan parquet (34) - +- Exchange (38) - +- Scan parquet (37) + Sort (51) + +- Exchange (50) + +- HashAggregate (49) + +- Exchange (48) + +- HashAggregate (47) + +- Project (46) + +- SortMergeJoin LeftAnti (45) + :- Sort (41) + : +- Exchange (40) + : +- Filter (39) + : +- Scan parquet (38) + +- Sort (44) + +- Exchange (43) + +- Scan parquet (42) (1) Scan parquet @@ -63,318 +65,346 @@ Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(9) Scan parquet +(10) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(10) ProjectExecTransformer +(11) ProjectExecTransformer Output [2]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_custkey#X] Input [1]: [o_custkey#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, o_custkey#X] Arguments: false -(12) ColumnarExchange +(13) VeloxAppendBatches +Input [2]: [hash_partition_key#X, o_custkey#X] +Arguments: X + +(14) ColumnarExchange Input [2]: [hash_partition_key#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [id=#X] -(13) ShuffleQueryStage +(15) ShuffleQueryStage Output [1]: [o_custkey#X] Arguments: X -(14) InputAdapter +(16) InputAdapter Input [1]: [o_custkey#X] -(15) InputIteratorTransformer +(17) InputIteratorTransformer Input [1]: [o_custkey#X] -(16) ShuffledHashJoinExecTransformer +(18) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(17) ProjectExecTransformer +(19) ProjectExecTransformer Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(18) FlushableHashAggregateExecTransformer +(20) FlushableHashAggregateExecTransformer Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), 
partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [5]: [hash(cntrycode#X, 42) AS hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] +Arguments: X + +(24) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(25) RegularHashAggregateExecTransformer +(28) RegularHashAggregateExecTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(26) WholeStageCodegenTransformer (X) +(29) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(27) ColumnarExchange +(30) VeloxAppendBatches +Input [3]: [cntrycode#X, numcust#X, totacctbal#X] +Arguments: X + +(31) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(28) ShuffleQueryStage +(32) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: X -(29) InputAdapter +(33) InputAdapter Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(30) InputIteratorTransformer +(34) InputIteratorTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(31) SortExecTransformer +(35) SortExecTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(32) WholeStageCodegenTransformer (X) +(36) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(33) VeloxColumnarToRowExec +(37) VeloxColumnarToRowExec Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(34) Scan parquet +(38) Scan parquet Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal)] ReadSchema: struct -(35) Filter +(39) Filter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) AND (cast(c_acctbal#X as decimal(16,6)) > Subquery subquery#X, [id=#X])) -(36) Exchange +(40) Exchange Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(37) Scan parquet +(41) Sort +Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(42) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Exchange 
+(43) Exchange Input [1]: [o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) ShuffledHashJoin +(44) Sort +Input [1]: [o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(45) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(40) Project +(46) Project Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(41) HashAggregate +(47) HashAggregate Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(42) Exchange +(48) Exchange Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) HashAggregate +(49) HashAggregate Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(44) Exchange +(50) Exchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(45) Sort +(51) Sort Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(46) AdaptiveSparkPlan +(52) AdaptiveSparkPlan Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: isFinalPlan=true ===== Subqueries ===== Subquery:1 Hosting operator id = 2 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (65) +AdaptiveSparkPlan (72) +- == Final Plan == - VeloxColumnarToRowExec (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) - +- ^ FlushableHashAggregateExecTransformer (50) - +- ^ ProjectExecTransformer (49) - +- ^ FilterExecTransformer (48) - +- ^ Scan parquet (47) + VeloxColumnarToRowExec (65) + +- ^ RegularHashAggregateExecTransformer (63) + +- ^ InputIteratorTransformer (62) + +- ShuffleQueryStage (60), Statistics(X) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ FlushableHashAggregateExecTransformer (56) + +- ^ ProjectExecTransformer (55) + +- ^ FilterExecTransformer (54) + +- ^ Scan parquet (53) +- == Initial Plan == - HashAggregate (64) - +- Exchange (63) - +- HashAggregate (62) - +- Project (61) - +- Filter (60) - +- Scan parquet (59) + HashAggregate (71) + +- Exchange (70) + +- HashAggregate (69) + +- Project (68) + +- Filter (67) + +- Scan parquet (66) -(47) Scan parquet +(53) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(48) FilterExecTransformer +(54) FilterExecTransformer Input [2]: [c_phone#X, c_acctbal#X] Arguments: ((isnotnull(c_acctbal#X) AND (c_acctbal#X > 0.00)) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) -(49) ProjectExecTransformer +(55) ProjectExecTransformer Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(50) FlushableHashAggregateExecTransformer +(56) FlushableHashAggregateExecTransformer Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: 
[sum#X, count#X] -(51) WholeStageCodegenTransformer (X) +(57) WholeStageCodegenTransformer (X) Input [2]: [sum#X, count#X] Arguments: false -(52) ColumnarExchange +(58) VeloxAppendBatches +Input [2]: [sum#X, count#X] +Arguments: X + +(59) ColumnarExchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(60) ShuffleQueryStage Output [2]: [sum#X, count#X] Arguments: X -(54) InputAdapter +(61) InputAdapter Input [2]: [sum#X, count#X] -(55) InputIteratorTransformer +(62) InputIteratorTransformer Input [2]: [sum#X, count#X] -(56) RegularHashAggregateExecTransformer +(63) RegularHashAggregateExecTransformer Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [1]: [avg(c_acctbal)#X] Arguments: false -(58) VeloxColumnarToRowExec +(65) VeloxColumnarToRowExec Input [1]: [avg(c_acctbal)#X] -(59) Scan parquet +(66) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(60) Filter +(67) Filter Input [2]: [c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND (c_acctbal#X > 0.00)) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) -(61) Project +(68) Project Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(62) HashAggregate +(69) HashAggregate Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(63) Exchange +(70) Exchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(64) HashAggregate +(71) HashAggregate Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(65) AdaptiveSparkPlan +(72) AdaptiveSparkPlan Output [1]: [avg(c_acctbal)#X] Arguments: isFinalPlan=true Subquery:2 Hosting operator id = 1 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (65) +AdaptiveSparkPlan (72) +- == Final Plan == - VeloxColumnarToRowExec (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) - +- ^ FlushableHashAggregateExecTransformer (50) - +- ^ ProjectExecTransformer (49) - +- ^ FilterExecTransformer (48) - +- ^ Scan parquet (47) + VeloxColumnarToRowExec (65) + +- ^ RegularHashAggregateExecTransformer (63) + +- ^ InputIteratorTransformer (62) + +- ShuffleQueryStage (60), Statistics(X) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ FlushableHashAggregateExecTransformer (56) + +- ^ ProjectExecTransformer (55) + +- ^ FilterExecTransformer (54) + +- ^ Scan parquet (53) +- == Initial Plan == - HashAggregate (64) - +- Exchange (63) - +- HashAggregate (62) - +- Project (61) - +- Filter (60) - +- Scan parquet (59) \ No newline at end of file + HashAggregate (71) + +- Exchange (70) + +- HashAggregate (69) + +- Project (68) + +- Filter (67) + +- Scan parquet (66) \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/3.txt index 8492e97cdbcc7..df17819cafe96 
100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/3.txt @@ -1,60 +1,64 @@ == Physical Plan == -AdaptiveSparkPlan (59) +AdaptiveSparkPlan (67) +- == Final Plan == - VeloxColumnarToRowExec (39) - +- TakeOrderedAndProjectExecTransformer (38) - +- ^ ProjectExecTransformer (36) - +- ^ RegularHashAggregateExecTransformer (35) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ ProjectExecTransformer (33) - +- ^ ShuffledHashJoinExecTransformer Inner (32) - :- ^ InputIteratorTransformer (23) - : +- ^ InputAdapter (22) - : +- ^ ShuffleQueryStage (21), Statistics(X) - : +- ColumnarExchange (20) - : +- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (43) + +- TakeOrderedAndProjectExecTransformer (42) + +- ^ ProjectExecTransformer (40) + +- ^ RegularHashAggregateExecTransformer (39) + +- ^ RegularHashAggregateExecTransformer (38) + +- ^ ProjectExecTransformer (37) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + :- ^ InputIteratorTransformer (26) + : +- ShuffleQueryStage (24), Statistics(X) + : +- ColumnarExchange (23) + : +- VeloxAppendBatches (22) + : +- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14), Statistics(X) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ FilterExecTransformer (10) - : +- ^ Scan parquet (9) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ ProjectExecTransformer (26) - +- ^ FilterExecTransformer (25) - +- ^ Scan parquet (24) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FilterExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ ProjectExecTransformer (29) + +- ^ FilterExecTransformer (28) + +- ^ Scan parquet (27) +- == Initial Plan == - TakeOrderedAndProject (58) - +- HashAggregate (57) - +- HashAggregate (56) - +- Project (55) - +- ShuffledHashJoin Inner BuildRight (54) - :- Exchange (49) - : +- Project (48) - : +- ShuffledHashJoin Inner BuildLeft (47) - : :- Exchange (43) - : : +- Project (42) - : : +- Filter (41) - : : +- Scan parquet (40) - : +- Exchange (46) - : +- Filter (45) - : +- Scan parquet (44) - +- Exchange (53) - +- Project (52) - +- Filter (51) - +- Scan parquet (50) + TakeOrderedAndProject (66) + +- HashAggregate (65) + +- HashAggregate (64) + +- Project (63) + +- SortMergeJoin Inner (62) + :- Sort (56) + : +- Exchange (55) + : +- Project (54) + : +- SortMergeJoin Inner (53) + : :- Sort (48) + : : +- Exchange (47) + : : +- Project (46) + : : +- Filter (45) + : : +- Scan parquet (44) + : +- Sort (52) + : +- Exchange (51) + : +- Filter (50) + : +- Scan parquet (49) + +- Sort 
(61) + +- Exchange (60) + +- Project (59) + +- Filter (58) + +- Scan parquet (57) (1) Scan parquet @@ -76,244 +80,276 @@ Input [2]: [c_custkey#X, c_mktsegment#X] Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, c_custkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [c_custkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [c_custkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [c_custkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] +Arguments: X + +(23) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage 
Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [6]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(34) RegularHashAggregateExecTransformer +(38) RegularHashAggregateExecTransformer Input [6]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(35) RegularHashAggregateExecTransformer +(39) RegularHashAggregateExecTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * 
promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(36) ProjectExecTransformer +(40) ProjectExecTransformer Output [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] Input [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] -(37) WholeStageCodegenTransformer (X) +(41) WholeStageCodegenTransformer (X) Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: false -(38) TakeOrderedAndProjectExecTransformer +(42) TakeOrderedAndProjectExecTransformer Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X], 0 -(39) VeloxColumnarToRowExec +(43) VeloxColumnarToRowExec Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(40) Scan parquet +(44) Scan parquet Output [2]: [c_custkey#X, c_mktsegment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_mktsegment), EqualTo(c_mktsegment,BUILDING), IsNotNull(c_custkey)] ReadSchema: struct -(41) Filter +(45) Filter Input [2]: [c_custkey#X, c_mktsegment#X] Condition : ((isnotnull(c_mktsegment#X) AND (c_mktsegment#X = BUILDING)) AND isnotnull(c_custkey#X)) -(42) Project +(46) Project Output [1]: [c_custkey#X] Input [2]: [c_custkey#X, c_mktsegment#X] -(43) Exchange +(47) Exchange Input [1]: [c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(44) Scan parquet +(48) Sort +Input [1]: [c_custkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(49) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(45) Filter +(50) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Condition : (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(46) Exchange +(51) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(47) ShuffledHashJoin +(52) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(53) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(48) Project +(54) Project Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(49) Exchange +(55) Exchange Input [3]: 
[o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(50) Scan parquet +(56) Sort +Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(57) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(51) Filter +(58) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(52) Project +(59) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(53) Exchange +(60) Exchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) ShuffledHashJoin +(61) Sort +Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(62) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(55) Project +(63) Project Output [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(56) HashAggregate +(64) HashAggregate Input [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(57) HashAggregate +(65) HashAggregate Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [4]: [l_orderkey#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] -(58) TakeOrderedAndProject +(66) TakeOrderedAndProject Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(59) AdaptiveSparkPlan +(67) AdaptiveSparkPlan Output [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git 
a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/4.txt index 22e9480fffc8e..85d303df874f2 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/4.txt @@ -1,51 +1,53 @@ == Physical Plan == -AdaptiveSparkPlan (50) +AdaptiveSparkPlan (56) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- ^ SortExecTransformer (32) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer LeftSemi (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (49) - +- Exchange (48) - +- HashAggregate (47) - +- Exchange (46) - +- HashAggregate (45) - +- Project (44) - +- ShuffledHashJoin LeftSemi BuildRight (43) - :- Exchange (38) - : +- Project (37) - : +- Filter (36) - : +- Scan parquet (35) - +- Exchange (42) - +- Project (41) - +- Filter (40) - +- Scan parquet (39) + Sort (55) + +- Exchange (54) + +- HashAggregate (53) + +- Exchange (52) + +- HashAggregate (51) + +- Project (50) + +- SortMergeJoin LeftSemi (49) + :- Sort (43) + : +- Exchange (42) + : +- Project (41) + : +- Filter (40) + : +- Scan parquet (39) + +- Sort (48) + +- Exchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -67,200 +69,224 @@ Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, 
o_orderpriority#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [o_orderkey#X, o_orderpriority#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderpriority#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Arguments: ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [2]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, l_orderkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [2]: [hash_partition_key#X, l_orderkey#X] +Arguments: X + +(15) ColumnarExchange Input [2]: [hash_partition_key#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [1]: [l_orderkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [1]: [l_orderkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [1]: [l_orderkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [3]: [hash(o_orderpriority#X, 42) AS hash_partition_key#X, o_orderpriority#X, count#X] Input [2]: [o_orderpriority#X, count#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] +Arguments: X + +(25) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [2]: [o_orderpriority#X, count#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [2]: [o_orderpriority#X, count#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: 
[o_orderpriority#X, order_count#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [2]: [o_orderpriority#X, order_count#X] +Arguments: X + +(32) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [o_orderpriority#X, order_count#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [o_orderpriority#X, order_count#X] -(32) SortExecTransformer +(36) SortExecTransformer Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(33) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(34) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [2]: [o_orderpriority#X, order_count#X] -(35) Scan parquet +(39) Scan parquet Output [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-07-01), LessThan(o_orderdate,1993-10-01)] ReadSchema: struct -(36) Filter +(40) Filter Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Condition : ((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-07-01)) AND (o_orderdate#X < 1993-10-01)) -(37) Project +(41) Project Output [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] -(38) Exchange +(42) Exchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) Scan parquet +(43) Sort +Input [2]: [o_orderkey#X, o_orderpriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(44) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(40) Filter +(45) Filter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(41) Project +(46) Project Output [1]: [l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(42) Exchange +(47) Exchange Input [1]: [l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) ShuffledHashJoin +(48) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(49) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(44) Project +(50) Project Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(45) HashAggregate +(51) HashAggregate Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(46) Exchange +(52) Exchange Input [2]: [o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(47) HashAggregate +(53) HashAggregate Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(48) Exchange +(54) Exchange 
Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) Sort +(55) Sort Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(50) AdaptiveSparkPlan +(56) AdaptiveSparkPlan Output [2]: [o_orderpriority#X, order_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/5.txt index 59273229a1ae2..8978f9563c68f 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/5.txt @@ -1,127 +1,137 @@ == Physical Plan == -AdaptiveSparkPlan (134) +AdaptiveSparkPlan (156) +- == Final Plan == - VeloxColumnarToRowExec (94) - +- ^ SortExecTransformer (92) - +- ^ InputIteratorTransformer (91) - +- ^ InputAdapter (90) - +- ^ ShuffleQueryStage (89), Statistics(X) - +- ColumnarExchange (88) - +- ^ RegularHashAggregateExecTransformer (86) - +- ^ InputIteratorTransformer (85) - +- ^ InputAdapter (84) - +- ^ ShuffleQueryStage (83), Statistics(X) - +- ColumnarExchange (82) - +- ^ ProjectExecTransformer (80) - +- ^ FlushableHashAggregateExecTransformer (79) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (106) + +- ^ SortExecTransformer (104) + +- ^ InputIteratorTransformer (103) + +- ShuffleQueryStage (101), Statistics(X) + +- ColumnarExchange (100) + +- VeloxAppendBatches (99) + +- ^ RegularHashAggregateExecTransformer (97) + +- ^ InputIteratorTransformer (96) + +- ShuffleQueryStage (94), Statistics(X) + +- ColumnarExchange (93) + +- VeloxAppendBatches (92) + +- ^ ProjectExecTransformer (90) + +- ^ FlushableHashAggregateExecTransformer (89) + +- ^ ProjectExecTransformer (88) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75), Statistics(X) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58), Statistics(X) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- 
^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41), Statistics(X) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ FilterExecTransformer (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ FilterExecTransformer (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ FilterExecTransformer (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ FilterExecTransformer (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ FilterExecTransformer (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74), Statistics(X) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ FilterExecTransformer (70) - +- ^ Scan parquet (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ FilterExecTransformer (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33), Statistics(X) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ FilterExecTransformer (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ FilterExecTransformer (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ FilterExecTransformer (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (86) + +- ShuffleQueryStage (84), Statistics(X) + +- ColumnarExchange (83) + +- VeloxAppendBatches (82) + +- ^ ProjectExecTransformer (80) + +- ^ FilterExecTransformer (79) + +- ^ Scan parquet (78) +- == Initial Plan == - Sort (133) - +- Exchange (132) - +- 
HashAggregate (131) - +- Exchange (130) - +- HashAggregate (129) - +- Project (128) - +- ShuffledHashJoin Inner BuildRight (127) - :- Exchange (122) - : +- Project (121) - : +- ShuffledHashJoin Inner BuildRight (120) - : :- Exchange (116) - : : +- Project (115) - : : +- ShuffledHashJoin Inner BuildRight (114) - : : :- Exchange (110) - : : : +- Project (109) - : : : +- ShuffledHashJoin Inner BuildRight (108) - : : : :- Exchange (104) - : : : : +- Project (103) - : : : : +- ShuffledHashJoin Inner BuildLeft (102) - : : : : :- Exchange (97) - : : : : : +- Filter (96) - : : : : : +- Scan parquet (95) - : : : : +- Exchange (101) - : : : : +- Project (100) - : : : : +- Filter (99) - : : : : +- Scan parquet (98) - : : : +- Exchange (107) - : : : +- Filter (106) - : : : +- Scan parquet (105) - : : +- Exchange (113) - : : +- Filter (112) - : : +- Scan parquet (111) - : +- Exchange (119) - : +- Filter (118) - : +- Scan parquet (117) - +- Exchange (126) - +- Project (125) - +- Filter (124) - +- Scan parquet (123) + Sort (155) + +- Exchange (154) + +- HashAggregate (153) + +- Exchange (152) + +- HashAggregate (151) + +- Project (150) + +- SortMergeJoin Inner (149) + :- Sort (143) + : +- Exchange (142) + : +- Project (141) + : +- SortMergeJoin Inner (140) + : :- Sort (135) + : : +- Exchange (134) + : : +- Project (133) + : : +- SortMergeJoin Inner (132) + : : :- Sort (127) + : : : +- Exchange (126) + : : : +- Project (125) + : : : +- SortMergeJoin Inner (124) + : : : :- Sort (119) + : : : : +- Exchange (118) + : : : : +- Project (117) + : : : : +- SortMergeJoin Inner (116) + : : : : :- Sort (110) + : : : : : +- Exchange (109) + : : : : : +- Filter (108) + : : : : : +- Scan parquet (107) + : : : : +- Sort (115) + : : : : +- Exchange (114) + : : : : +- Project (113) + : : : : +- Filter (112) + : : : : +- Scan parquet (111) + : : : +- Sort (123) + : : : +- Exchange (122) + : : : +- Filter (121) + : : : +- Scan parquet (120) + : : +- Sort (131) + : : +- Exchange (130) + : : +- Filter (129) + : : +- Scan parquet (128) + : +- Sort (139) + : +- Exchange (138) + : +- Filter (137) + : +- Scan parquet (136) + +- Sort (148) + +- Exchange (147) + +- Project (146) + +- Filter (145) + +- Scan parquet (144) (1) Scan parquet @@ -143,552 +153,640 @@ Input [2]: [c_custkey#X, c_nationkey#X] Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND 
isnotnull(o_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] +Arguments: X + +(23) ColumnarExchange Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [2]: [c_nationkey#X, o_orderkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [2]: [c_nationkey#X, o_orderkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [2]: [c_nationkey#X, o_orderkey#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [5]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] 
Arguments: X -(30) InputAdapter +(34) InputAdapter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [5]: [hash(l_suppkey#X, c_nationkey#X, 42) AS hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(40) ColumnarExchange Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, s_nationkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(49) 
WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(57) ColumnarExchange Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] -(54) Scan parquet +(61) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(55) FilterExecTransformer +(62) FilterExecTransformer Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [4]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] +Arguments: X + +(66) ColumnarExchange Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [5]: [hash(n_regionkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] +Arguments: X + +(74) ColumnarExchange Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: X -(67) 
InputAdapter +(76) InputAdapter Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(70) FilterExecTransformer +(79) FilterExecTransformer Input [2]: [r_regionkey#X, r_name#X] Arguments: ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [2]: [hash(r_regionkey#X, 42) AS hash_partition_key#X, r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [2]: [hash_partition_key#X, r_regionkey#X] +Arguments: X + +(83) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [1]: [r_regionkey#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [1]: [r_regionkey#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [1]: [r_regionkey#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS _pre_X#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(79) FlushableHashAggregateExecTransformer +(89) FlushableHashAggregateExecTransformer Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, _pre_X#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(80) ProjectExecTransformer +(90) ProjectExecTransformer Output [4]: [hash(n_name#X, 42) AS hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Input [3]: [n_name#X, sum#X, isEmpty#X] -(81) WholeStageCodegenTransformer (X) +(91) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: false -(82) ColumnarExchange +(92) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] +Arguments: X + +(93) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(83) ShuffleQueryStage +(94) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] Arguments: X -(84) InputAdapter +(95) InputAdapter Input [3]: [n_name#X, sum#X, isEmpty#X] -(85) InputIteratorTransformer +(96) InputIteratorTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] -(86) RegularHashAggregateExecTransformer +(97) RegularHashAggregateExecTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as 
decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [2]: [n_name#X, revenue#X] +Arguments: X + +(100) ColumnarExchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [2]: [n_name#X, revenue#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [2]: [n_name#X, revenue#X] -(92) SortExecTransformer +(104) SortExecTransformer Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(93) WholeStageCodegenTransformer (X) +(105) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(94) VeloxColumnarToRowExec +(106) VeloxColumnarToRowExec Input [2]: [n_name#X, revenue#X] -(95) Scan parquet +(107) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(96) Filter +(108) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(97) Exchange +(109) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(98) Scan parquet +(110) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(111) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(99) Filter +(112) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(100) Project +(113) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(101) Exchange +(114) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) ShuffledHashJoin +(115) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(116) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join condition: None -(103) Project +(117) Project Output [2]: [c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(104) Exchange +(118) Exchange Input [2]: [c_nationkey#X, o_orderkey#X] 
Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) Scan parquet +(119) Sort +Input [2]: [c_nationkey#X, o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(120) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(106) Filter +(121) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(107) Exchange +(122) Exchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) ShuffledHashJoin +(123) Sort +Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(124) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join condition: None -(109) Project +(125) Project Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(110) Exchange +(126) Exchange Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(111) Scan parquet +(127) Sort +Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST, c_nationkey#X ASC NULLS FIRST], false, 0 + +(128) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(112) Filter +(129) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(113) Exchange +(130) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(114) ShuffledHashJoin +(131) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST, s_nationkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join condition: None -(115) Project +(133) Project Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(116) Exchange +(134) Exchange Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) Scan parquet +(135) Sort +Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(136) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(118) Filter +(137) Filter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(119) Exchange +(138) Exchange Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(120) ShuffledHashJoin +(139) Sort +Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] +Arguments: 
[n_nationkey#X ASC NULLS FIRST], false, 0 + +(140) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(121) Project +(141) Project Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(122) Exchange +(142) Exchange Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(123) Scan parquet +(143) Sort +Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] +Arguments: [n_regionkey#X ASC NULLS FIRST], false, 0 + +(144) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(124) Filter +(145) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(125) Project +(146) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(126) Exchange +(147) Exchange Input [1]: [r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(127) ShuffledHashJoin +(148) Sort +Input [1]: [r_regionkey#X] +Arguments: [r_regionkey#X ASC NULLS FIRST], false, 0 + +(149) SortMergeJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(128) Project +(150) Project Output [3]: [l_extendedprice#X, l_discount#X, n_name#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(129) HashAggregate +(151) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, n_name#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(130) Exchange +(152) Exchange Input [3]: [n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(131) HashAggregate +(153) HashAggregate Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X] Results [2]: [n_name#X, sum(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)))#X AS revenue#X] -(132) Exchange +(154) Exchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(133) Sort +(155) Sort Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(134) AdaptiveSparkPlan +(156) AdaptiveSparkPlan Output [2]: [n_name#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git 
a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/6.txt index 15f7640401849..3432579a0de06 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/6.txt @@ -1,23 +1,23 @@ == Physical Plan == -AdaptiveSparkPlan (19) +AdaptiveSparkPlan (20) +- == Final Plan == - VeloxColumnarToRowExec (12) - +- ^ RegularHashAggregateExecTransformer (10) - +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ ShuffleQueryStage (7), Statistics(X) - +- ColumnarExchange (6) + VeloxColumnarToRowExec (13) + +- ^ RegularHashAggregateExecTransformer (11) + +- ^ InputIteratorTransformer (10) + +- ShuffleQueryStage (8), Statistics(X) + +- ColumnarExchange (7) + +- VeloxAppendBatches (6) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ FilterExecTransformer (2) +- ^ Scan parquet (1) +- == Initial Plan == - HashAggregate (18) - +- Exchange (17) - +- HashAggregate (16) - +- Project (15) - +- Filter (14) - +- Scan parquet (13) + HashAggregate (19) + +- Exchange (18) + +- HashAggregate (17) + +- Project (16) + +- Filter (15) + +- Scan parquet (14) (1) Scan parquet @@ -46,67 +46,71 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(6) ColumnarExchange +(6) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(7) ShuffleQueryStage +(8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(8) InputAdapter +(9) InputAdapter Input [2]: [sum#X, isEmpty#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(10) RegularHashAggregateExecTransformer +(11) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X AS revenue#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(12) VeloxColumnarToRowExec +(13) VeloxColumnarToRowExec Input [1]: [revenue#X] -(13) Scan parquet +(14) Scan parquet Output [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), IsNotNull(l_discount), IsNotNull(l_quantity), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), GreaterThanOrEqual(l_discount,0.05), LessThanOrEqual(l_discount,0.07), LessThan(l_quantity,24.00)] ReadSchema: struct -(14) Filter +(15) Filter Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((((((isnotnull(l_shipdate#X) AND isnotnull(l_discount#X)) AND isnotnull(l_quantity#X)) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND (l_discount#X >= 0.05)) AND (l_discount#X <= 0.07)) AND (l_quantity#X < 24.00)) -(15) Project +(16) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) HashAggregate +(17) HashAggregate Input 
[2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(17) Exchange +(18) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(18) HashAggregate +(19) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X] Results [1]: [sum(CheckOverflow((promote_precision(l_extendedprice#X) * promote_precision(l_discount#X)), DecimalType(25,4)))#X AS revenue#X] -(19) AdaptiveSparkPlan +(20) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/7.txt index 2aa77120c6937..244f650f3a72c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/7.txt @@ -1,122 +1,131 @@ == Physical Plan == -AdaptiveSparkPlan (128) +AdaptiveSparkPlan (149) +- == Final Plan == - VeloxColumnarToRowExec (90) - +- ^ SortExecTransformer (88) - +- ^ InputIteratorTransformer (87) - +- ^ InputAdapter (86) - +- ^ ShuffleQueryStage (85), Statistics(X) - +- ColumnarExchange (84) - +- ^ RegularHashAggregateExecTransformer (82) - +- ^ InputIteratorTransformer (81) - +- ^ InputAdapter (80) - +- ^ ShuffleQueryStage (79), Statistics(X) - +- ColumnarExchange (78) - +- ^ ProjectExecTransformer (76) - +- ^ FlushableHashAggregateExecTransformer (75) - +- ^ ProjectExecTransformer (74) - +- ^ ShuffledHashJoinExecTransformer Inner (73) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (101) + +- ^ SortExecTransformer (99) + +- ^ InputIteratorTransformer (98) + +- ShuffleQueryStage (96), Statistics(X) + +- ColumnarExchange (95) + +- VeloxAppendBatches (94) + +- ^ RegularHashAggregateExecTransformer (92) + +- ^ InputIteratorTransformer (91) + +- ShuffleQueryStage (89), Statistics(X) + +- ColumnarExchange (88) + +- VeloxAppendBatches (87) + +- 
^ ProjectExecTransformer (85) + +- ^ FlushableHashAggregateExecTransformer (84) + +- ^ ProjectExecTransformer (83) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (82) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75), Statistics(X) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58), Statistics(X) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41), Statistics(X) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ FilterExecTransformer (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ FilterExecTransformer (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ FilterExecTransformer (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ FilterExecTransformer (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ FilterExecTransformer (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (72) - +- ^ InputAdapter (71) - +- ^ ShuffleQueryStage (70), Statistics(X) - +- ReusedExchange (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ FilterExecTransformer (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33), Statistics(X) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ FilterExecTransformer (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ FilterExecTransformer (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- 
ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ FilterExecTransformer (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (81) + +- ShuffleQueryStage (79), Statistics(X) + +- ReusedExchange (78) +- == Initial Plan == - Sort (127) - +- Exchange (126) - +- HashAggregate (125) - +- Exchange (124) - +- HashAggregate (123) - +- Project (122) - +- ShuffledHashJoin Inner BuildRight (121) - :- Exchange (117) - : +- Project (116) - : +- ShuffledHashJoin Inner BuildRight (115) - : :- Exchange (111) - : : +- Project (110) - : : +- ShuffledHashJoin Inner BuildRight (109) - : : :- Exchange (105) - : : : +- Project (104) - : : : +- ShuffledHashJoin Inner BuildRight (103) - : : : :- Exchange (99) - : : : : +- Project (98) - : : : : +- ShuffledHashJoin Inner BuildLeft (97) - : : : : :- Exchange (93) - : : : : : +- Filter (92) - : : : : : +- Scan parquet (91) - : : : : +- Exchange (96) - : : : : +- Filter (95) - : : : : +- Scan parquet (94) - : : : +- Exchange (102) - : : : +- Filter (101) - : : : +- Scan parquet (100) - : : +- Exchange (108) - : : +- Filter (107) - : : +- Scan parquet (106) - : +- Exchange (114) - : +- Filter (113) - : +- Scan parquet (112) - +- Exchange (120) - +- Filter (119) - +- Scan parquet (118) + Sort (148) + +- Exchange (147) + +- HashAggregate (146) + +- Exchange (145) + +- HashAggregate (144) + +- Project (143) + +- SortMergeJoin Inner (142) + :- Sort (137) + : +- Exchange (136) + : +- Project (135) + : +- SortMergeJoin Inner (134) + : :- Sort (129) + : : +- Exchange (128) + : : +- Project (127) + : : +- SortMergeJoin Inner (126) + : : :- Sort (121) + : : : +- Exchange (120) + : : : +- Project (119) + : : : +- SortMergeJoin Inner (118) + : : : :- Sort (113) + : : : : +- Exchange (112) + : : : : +- Project (111) + : : : : +- SortMergeJoin Inner (110) + : : : : :- Sort (105) + : : : : : +- Exchange (104) + : : : : : +- Filter (103) + : : : : : +- Scan parquet (102) + : : : : +- Sort (109) + : : : : +- Exchange (108) + : : : : +- Filter (107) + : : : : +- Scan parquet (106) + : : : +- Sort (117) + : : : +- Exchange (116) + : : : +- Filter (115) + : : : +- Scan parquet (114) + : : +- Sort (125) + : : +- Exchange (124) + : : +- Filter (123) + : : +- Scan parquet (122) + : +- Sort (133) + : +- Exchange (132) + : +- Filter (131) + : +- Scan parquet (130) + +- Sort (141) + +- Exchange (140) + +- Filter (139) + +- Scan parquet (138) (1) Scan parquet @@ -138,524 +147,608 @@ Input [2]: [s_suppkey#X, s_nationkey#X] Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), 
IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [6]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: X + +(15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [6]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: X + +(23) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) 
FilterExecTransformer Input [2]: [o_orderkey#X, o_custkey#X] Arguments: (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [2]: [o_orderkey#X, o_custkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [6]: [hash(o_custkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] +Arguments: X + +(40) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [2]: [c_custkey#X, c_nationkey#X] Arguments: (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(c_custkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_nationkey#X] Input [2]: [c_custkey#X, c_nationkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, 
c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [6]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] +Arguments: X + +(57) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(55) FilterExecTransformer +(62) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(63) ProjectExecTransformer +(71) 
ProjectExecTransformer Output [6]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] +Arguments: X + +(74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -(69) ReusedExchange [Reuses operator id: 58] +(78) ReusedExchange [Reuses operator id: 66] Output [2]: [n_nationkey#X, n_name#X] -(70) ShuffleQueryStage +(79) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(71) InputAdapter +(80) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(72) InputIteratorTransformer +(81) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(73) ShuffledHashJoinExecTransformer +(82) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(74) ProjectExecTransformer +(83) ProjectExecTransformer Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(75) FlushableHashAggregateExecTransformer +(84) FlushableHashAggregateExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(76) ProjectExecTransformer +(85) ProjectExecTransformer Output [6]: [hash(supp_nation#X, cust_nation#X, l_year#X, 42) AS hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(77) WholeStageCodegenTransformer (X) +(86) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: false -(78) ColumnarExchange +(87) VeloxAppendBatches +Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] +Arguments: X + +(88) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, 
cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(79) ShuffleQueryStage +(89) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: X -(80) InputAdapter +(90) InputAdapter Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(81) InputIteratorTransformer +(91) InputIteratorTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(82) RegularHashAggregateExecTransformer +(92) RegularHashAggregateExecTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(83) WholeStageCodegenTransformer (X) +(93) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(84) ColumnarExchange +(94) VeloxAppendBatches +Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] +Arguments: X + +(95) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(85) ShuffleQueryStage +(96) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: X -(86) InputAdapter +(97) InputAdapter Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(87) InputIteratorTransformer +(98) InputIteratorTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(88) SortExecTransformer +(99) SortExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(89) WholeStageCodegenTransformer (X) +(100) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(90) VeloxColumnarToRowExec +(101) VeloxColumnarToRowExec Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(91) Scan parquet +(102) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(92) Filter +(103) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(93) Exchange +(104) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(94) Scan parquet +(105) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(106) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(95) Filter +(107) Filter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND 
isnotnull(l_orderkey#X)) -(96) Exchange +(108) Exchange Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) ShuffledHashJoin +(109) Sort +Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(110) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join condition: None -(98) Project +(111) Project Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(99) Exchange +(112) Exchange Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(100) Scan parquet +(113) Sort +Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(114) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(101) Filter +(115) Filter Input [2]: [o_orderkey#X, o_custkey#X] Condition : (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(102) Exchange +(116) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(103) ShuffledHashJoin +(117) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(118) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(104) Project +(119) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(105) Exchange +(120) Exchange Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) Scan parquet +(121) Sort +Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(122) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(107) Filter +(123) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(108) Exchange +(124) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(125) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(126) SortMergeJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(110) Project +(127) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(111) Exchange +(128) Exchange Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] 
-(112) Scan parquet +(129) Sort +Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(130) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(113) Filter +(131) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(114) Exchange +(132) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(115) ShuffledHashJoin +(133) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(134) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(116) Project +(135) Project Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(117) Exchange +(136) Exchange Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(118) Scan parquet +(137) Sort +Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(138) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,GERMANY),EqualTo(n_name,FRANCE))] ReadSchema: struct -(119) Filter +(139) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = GERMANY) OR (n_name#X = FRANCE))) -(120) Exchange +(140) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(121) ShuffledHashJoin +(141) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(142) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(122) Project +(143) Project Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(123) HashAggregate +(144) HashAggregate Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(124) Exchange +(145) Exchange Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(125) HashAggregate +(146) HashAggregate Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] 
Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(126) Exchange +(147) Exchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(127) Sort +(148) Sort Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(128) AdaptiveSparkPlan +(149) AdaptiveSparkPlan Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/8.txt index ab8ef221e5037..282790ba65070 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/8.txt @@ -1,166 +1,180 @@ == Physical Plan == -AdaptiveSparkPlan (177) +AdaptiveSparkPlan (207) +- == Final Plan == - VeloxColumnarToRowExec (125) - +- ^ SortExecTransformer (123) - +- ^ InputIteratorTransformer (122) - +- ^ InputAdapter (121) - +- ^ ShuffleQueryStage (120), Statistics(X) - +- ColumnarExchange (119) - +- ^ ProjectExecTransformer (117) - +- ^ RegularHashAggregateExecTransformer (116) - +- ^ InputIteratorTransformer (115) - +- ^ InputAdapter (114) - +- ^ ShuffleQueryStage (113), Statistics(X) - +- ColumnarExchange (112) - +- ^ ProjectExecTransformer (110) - +- ^ FlushableHashAggregateExecTransformer (109) - +- ^ ProjectExecTransformer (108) - +- ^ ShuffledHashJoinExecTransformer Inner (107) - :- ^ InputIteratorTransformer (98) - : +- ^ InputAdapter (97) - : +- ^ ShuffleQueryStage (96), Statistics(X) - : +- ColumnarExchange (95) - : +- ^ ProjectExecTransformer (93) - : +- ^ ShuffledHashJoinExecTransformer Inner (92) - : :- ^ InputIteratorTransformer (83) - : : +- ^ InputAdapter (82) - : : +- ^ ShuffleQueryStage (81), Statistics(X) - : : +- ColumnarExchange (80) - : : +- ^ ProjectExecTransformer (78) - : : +- ^ ShuffledHashJoinExecTransformer Inner (77) - : : :- ^ InputIteratorTransformer (68) - : : : +- ^ InputAdapter (67) - : : : +- ^ ShuffleQueryStage (66), Statistics(X) - : : : +- ColumnarExchange (65) - : : : +- ^ ProjectExecTransformer (63) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : : : :- ^ InputIteratorTransformer (53) - : : : : +- ^ InputAdapter (52) - : : : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : : : +- ColumnarExchange (50) - : : : : +- ^ ProjectExecTransformer (48) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : : : :- ^ InputIteratorTransformer (38) - : : : : : +- ^ InputAdapter (37) - : : : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : : : +- ColumnarExchange (35) - : : : : : +- ^ ProjectExecTransformer (33) - : : : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : : : :- ^ InputIteratorTransformer (23) - : : : : : : +- ^ InputAdapter (22) - : : : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : : : +- ColumnarExchange (20) - : : : : : : +- ^ ProjectExecTransformer (18) - : : : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : : : :- ^ InputIteratorTransformer (8) - : : : : : : : +- ^ InputAdapter (7) - : : : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : : : +- ColumnarExchange (5) + 
VeloxColumnarToRowExec (141) + +- ^ SortExecTransformer (139) + +- ^ InputIteratorTransformer (138) + +- ShuffleQueryStage (136), Statistics(X) + +- ColumnarExchange (135) + +- VeloxAppendBatches (134) + +- ^ ProjectExecTransformer (132) + +- ^ RegularHashAggregateExecTransformer (131) + +- ^ InputIteratorTransformer (130) + +- ShuffleQueryStage (128), Statistics(X) + +- ColumnarExchange (127) + +- VeloxAppendBatches (126) + +- ^ ProjectExecTransformer (124) + +- ^ FlushableHashAggregateExecTransformer (123) + +- ^ ProjectExecTransformer (122) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (121) + :- ^ InputIteratorTransformer (111) + : +- ShuffleQueryStage (109), Statistics(X) + : +- ColumnarExchange (108) + : +- VeloxAppendBatches (107) + : +- ^ ProjectExecTransformer (105) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (104) + : :- ^ InputIteratorTransformer (94) + : : +- ShuffleQueryStage (92), Statistics(X) + : : +- ColumnarExchange (91) + : : +- VeloxAppendBatches (90) + : : +- ^ ProjectExecTransformer (88) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + : : :- ^ InputIteratorTransformer (77) + : : : +- ShuffleQueryStage (75), Statistics(X) + : : : +- ColumnarExchange (74) + : : : +- VeloxAppendBatches (73) + : : : +- ^ ProjectExecTransformer (71) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (70) + : : : :- ^ InputIteratorTransformer (60) + : : : : +- ShuffleQueryStage (58), Statistics(X) + : : : : +- ColumnarExchange (57) + : : : : +- VeloxAppendBatches (56) + : : : : +- ^ ProjectExecTransformer (54) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (53) + : : : : :- ^ InputIteratorTransformer (43) + : : : : : +- ShuffleQueryStage (41), Statistics(X) + : : : : : +- ColumnarExchange (40) + : : : : : +- VeloxAppendBatches (39) + : : : : : +- ^ ProjectExecTransformer (37) + : : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : : : :- ^ InputIteratorTransformer (26) + : : : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : : : +- ColumnarExchange (23) + : : : : : : +- VeloxAppendBatches (22) + : : : : : : +- ^ ProjectExecTransformer (20) + : : : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : : : :- ^ InputIteratorTransformer (9) + : : : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : : : +- ColumnarExchange (6) + : : : : : : : +- VeloxAppendBatches (5) : : : : : : : +- ^ ProjectExecTransformer (3) : : : : : : : +- ^ FilterExecTransformer (2) : : : : : : : +- ^ Scan parquet (1) - : : : : : : +- ^ InputIteratorTransformer (16) - : : : : : : +- ^ InputAdapter (15) - : : : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : : : +- ColumnarExchange (13) - : : : : : : +- ^ ProjectExecTransformer (11) - : : : : : : +- ^ FilterExecTransformer (10) - : : : : : : +- ^ Scan parquet (9) - : : : : : +- ^ InputIteratorTransformer (31) - : : : : : +- ^ InputAdapter (30) - : : : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : : : +- ColumnarExchange (28) - : : : : : +- ^ ProjectExecTransformer (26) - : : : : : +- ^ FilterExecTransformer (25) - : : : : : +- ^ Scan parquet (24) - : : : : +- ^ InputIteratorTransformer (46) - : : : : +- ^ InputAdapter (45) - : : : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : : : +- ColumnarExchange (43) - : : : : +- ^ ProjectExecTransformer (41) - : : : : +- ^ FilterExecTransformer (40) - : : : : +- ^ Scan parquet (39) - : : : +- ^ InputIteratorTransformer (61) - : : : +- ^ InputAdapter (60) - : : : +- ^ 
ShuffleQueryStage (59), Statistics(X) - : : : +- ColumnarExchange (58) - : : : +- ^ ProjectExecTransformer (56) - : : : +- ^ FilterExecTransformer (55) - : : : +- ^ Scan parquet (54) - : : +- ^ InputIteratorTransformer (76) - : : +- ^ InputAdapter (75) - : : +- ^ ShuffleQueryStage (74), Statistics(X) - : : +- ColumnarExchange (73) - : : +- ^ ProjectExecTransformer (71) - : : +- ^ FilterExecTransformer (70) - : : +- ^ Scan parquet (69) - : +- ^ InputIteratorTransformer (91) - : +- ^ InputAdapter (90) - : +- ^ ShuffleQueryStage (89), Statistics(X) - : +- ColumnarExchange (88) - : +- ^ ProjectExecTransformer (86) - : +- ^ FilterExecTransformer (85) - : +- ^ Scan parquet (84) - +- ^ InputIteratorTransformer (106) - +- ^ InputAdapter (105) - +- ^ ShuffleQueryStage (104), Statistics(X) - +- ColumnarExchange (103) - +- ^ ProjectExecTransformer (101) - +- ^ FilterExecTransformer (100) - +- ^ Scan parquet (99) + : : : : : : +- ^ InputIteratorTransformer (18) + : : : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : : : +- ColumnarExchange (15) + : : : : : : +- VeloxAppendBatches (14) + : : : : : : +- ^ ProjectExecTransformer (12) + : : : : : : +- ^ FilterExecTransformer (11) + : : : : : : +- ^ Scan parquet (10) + : : : : : +- ^ InputIteratorTransformer (35) + : : : : : +- ShuffleQueryStage (33), Statistics(X) + : : : : : +- ColumnarExchange (32) + : : : : : +- VeloxAppendBatches (31) + : : : : : +- ^ ProjectExecTransformer (29) + : : : : : +- ^ FilterExecTransformer (28) + : : : : : +- ^ Scan parquet (27) + : : : : +- ^ InputIteratorTransformer (52) + : : : : +- ShuffleQueryStage (50), Statistics(X) + : : : : +- ColumnarExchange (49) + : : : : +- VeloxAppendBatches (48) + : : : : +- ^ ProjectExecTransformer (46) + : : : : +- ^ FilterExecTransformer (45) + : : : : +- ^ Scan parquet (44) + : : : +- ^ InputIteratorTransformer (69) + : : : +- ShuffleQueryStage (67), Statistics(X) + : : : +- ColumnarExchange (66) + : : : +- VeloxAppendBatches (65) + : : : +- ^ ProjectExecTransformer (63) + : : : +- ^ FilterExecTransformer (62) + : : : +- ^ Scan parquet (61) + : : +- ^ InputIteratorTransformer (86) + : : +- ShuffleQueryStage (84), Statistics(X) + : : +- ColumnarExchange (83) + : : +- VeloxAppendBatches (82) + : : +- ^ ProjectExecTransformer (80) + : : +- ^ FilterExecTransformer (79) + : : +- ^ Scan parquet (78) + : +- ^ InputIteratorTransformer (103) + : +- ShuffleQueryStage (101), Statistics(X) + : +- ColumnarExchange (100) + : +- VeloxAppendBatches (99) + : +- ^ ProjectExecTransformer (97) + : +- ^ FilterExecTransformer (96) + : +- ^ Scan parquet (95) + +- ^ InputIteratorTransformer (120) + +- ShuffleQueryStage (118), Statistics(X) + +- ColumnarExchange (117) + +- VeloxAppendBatches (116) + +- ^ ProjectExecTransformer (114) + +- ^ FilterExecTransformer (113) + +- ^ Scan parquet (112) +- == Initial Plan == - Sort (176) - +- Exchange (175) - +- HashAggregate (174) - +- Exchange (173) - +- HashAggregate (172) - +- Project (171) - +- ShuffledHashJoin Inner BuildRight (170) - :- Exchange (165) - : +- Project (164) - : +- ShuffledHashJoin Inner BuildRight (163) - : :- Exchange (159) - : : +- Project (158) - : : +- ShuffledHashJoin Inner BuildRight (157) - : : :- Exchange (153) - : : : +- Project (152) - : : : +- ShuffledHashJoin Inner BuildRight (151) - : : : :- Exchange (147) - : : : : +- Project (146) - : : : : +- ShuffledHashJoin Inner BuildRight (145) - : : : : :- Exchange (141) - : : : : : +- Project (140) - : : : : : +- ShuffledHashJoin Inner BuildRight (139) - : : : : : :- Exchange (135) - : : 
: : : : +- Project (134) - : : : : : : +- ShuffledHashJoin Inner BuildLeft (133) - : : : : : : :- Exchange (129) - : : : : : : : +- Project (128) - : : : : : : : +- Filter (127) - : : : : : : : +- Scan parquet (126) - : : : : : : +- Exchange (132) - : : : : : : +- Filter (131) - : : : : : : +- Scan parquet (130) - : : : : : +- Exchange (138) - : : : : : +- Filter (137) - : : : : : +- Scan parquet (136) - : : : : +- Exchange (144) - : : : : +- Filter (143) - : : : : +- Scan parquet (142) - : : : +- Exchange (150) - : : : +- Filter (149) - : : : +- Scan parquet (148) - : : +- Exchange (156) - : : +- Filter (155) - : : +- Scan parquet (154) - : +- Exchange (162) - : +- Filter (161) - : +- Scan parquet (160) - +- Exchange (169) - +- Project (168) - +- Filter (167) - +- Scan parquet (166) + Sort (206) + +- Exchange (205) + +- HashAggregate (204) + +- Exchange (203) + +- HashAggregate (202) + +- Project (201) + +- SortMergeJoin Inner (200) + :- Sort (194) + : +- Exchange (193) + : +- Project (192) + : +- SortMergeJoin Inner (191) + : :- Sort (186) + : : +- Exchange (185) + : : +- Project (184) + : : +- SortMergeJoin Inner (183) + : : :- Sort (178) + : : : +- Exchange (177) + : : : +- Project (176) + : : : +- SortMergeJoin Inner (175) + : : : :- Sort (170) + : : : : +- Exchange (169) + : : : : +- Project (168) + : : : : +- SortMergeJoin Inner (167) + : : : : :- Sort (162) + : : : : : +- Exchange (161) + : : : : : +- Project (160) + : : : : : +- SortMergeJoin Inner (159) + : : : : : :- Sort (154) + : : : : : : +- Exchange (153) + : : : : : : +- Project (152) + : : : : : : +- SortMergeJoin Inner (151) + : : : : : : :- Sort (146) + : : : : : : : +- Exchange (145) + : : : : : : : +- Project (144) + : : : : : : : +- Filter (143) + : : : : : : : +- Scan parquet (142) + : : : : : : +- Sort (150) + : : : : : : +- Exchange (149) + : : : : : : +- Filter (148) + : : : : : : +- Scan parquet (147) + : : : : : +- Sort (158) + : : : : : +- Exchange (157) + : : : : : +- Filter (156) + : : : : : +- Scan parquet (155) + : : : : +- Sort (166) + : : : : +- Exchange (165) + : : : : +- Filter (164) + : : : : +- Scan parquet (163) + : : : +- Sort (174) + : : : +- Exchange (173) + : : : +- Filter (172) + : : : +- Scan parquet (171) + : : +- Sort (182) + : : +- Exchange (181) + : : +- Filter (180) + : : +- Scan parquet (179) + : +- Sort (190) + : +- Exchange (189) + : +- Filter (188) + : +- Scan parquet (187) + +- Sort (199) + +- Exchange (198) + +- Project (197) + +- Filter (196) + +- Scan parquet (195) (1) Scan parquet @@ -182,732 +196,852 @@ Input [2]: [p_partkey#X, p_type#X] Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [p_partkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [p_partkey#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, 
l_extendedprice#X, l_discount#X] Arguments: ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [6]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(23) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: 
[hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [5]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(40) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(39) Scan parquet +(44) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: 
[o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [6]: [hash(o_custkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] +Arguments: X + +(57) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(55) FilterExecTransformer +(62) FilterExecTransformer Input [2]: [c_custkey#X, c_nationkey#X] Arguments: (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(c_custkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_nationkey#X] Input [2]: [c_custkey#X, c_nationkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [6]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, 
l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] +Arguments: X + +(74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(70) FilterExecTransformer +(79) FilterExecTransformer Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_regionkey#X] Input [2]: [n_nationkey#X, n_regionkey#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] +Arguments: X + +(83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_regionkey#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [2]: [n_nationkey#X, n_regionkey#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [2]: [n_nationkey#X, n_regionkey#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [6]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(79) WholeStageCodegenTransformer (X) +(89) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: false -(80) ColumnarExchange +(90) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] +Arguments: X + +(91) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] 
Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [id=#X] -(81) ShuffleQueryStage +(92) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: X -(82) InputAdapter +(93) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -(83) InputIteratorTransformer +(94) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -(84) Scan parquet +(95) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(85) FilterExecTransformer +(96) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: isnotnull(n_nationkey#X) -(86) ProjectExecTransformer +(97) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(100) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(92) ShuffledHashJoinExecTransformer +(104) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(93) ProjectExecTransformer +(105) ProjectExecTransformer Output [6]: [hash(n_regionkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(94) WholeStageCodegenTransformer (X) +(106) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: false -(95) ColumnarExchange +(107) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] +Arguments: X + +(108) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [id=#X] -(96) ShuffleQueryStage +(109) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: X -(97) InputAdapter +(110) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -(98) InputIteratorTransformer +(111) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -(99) Scan parquet +(112) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: 
[IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(100) FilterExecTransformer +(113) FilterExecTransformer Input [2]: [r_regionkey#X, r_name#X] Arguments: ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(101) ProjectExecTransformer +(114) ProjectExecTransformer Output [2]: [hash(r_regionkey#X, 42) AS hash_partition_key#X, r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(102) WholeStageCodegenTransformer (X) +(115) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: false -(103) ColumnarExchange +(116) VeloxAppendBatches +Input [2]: [hash_partition_key#X, r_regionkey#X] +Arguments: X + +(117) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] -(104) ShuffleQueryStage +(118) ShuffleQueryStage Output [1]: [r_regionkey#X] Arguments: X -(105) InputAdapter +(119) InputAdapter Input [1]: [r_regionkey#X] -(106) InputIteratorTransformer +(120) InputIteratorTransformer Input [1]: [r_regionkey#X] -(107) ShuffledHashJoinExecTransformer +(121) ShuffledHashJoinExecTransformer Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(108) ProjectExecTransformer +(122) ProjectExecTransformer Output [4]: [year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS volume#X, n_name#X AS nation#X, CASE WHEN (n_name#X = BRAZIL) THEN CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) ELSE 0.0000 END AS _pre_X#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(109) FlushableHashAggregateExecTransformer +(123) FlushableHashAggregateExecTransformer Input [4]: [o_year#X, volume#X, nation#X, _pre_X#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(_pre_X#X), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(110) ProjectExecTransformer +(124) ProjectExecTransformer Output [6]: [hash(o_year#X, 42) AS hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(111) WholeStageCodegenTransformer (X) +(125) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(112) ColumnarExchange +(126) VeloxAppendBatches +Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(127) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(113) ShuffleQueryStage +(128) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(114) InputAdapter +(129) InputAdapter Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(115) InputIteratorTransformer +(130) InputIteratorTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(116) RegularHashAggregateExecTransformer +(131) 
RegularHashAggregateExecTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(117) ProjectExecTransformer +(132) ProjectExecTransformer Output [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6)) AS mkt_share#X] Input [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(118) WholeStageCodegenTransformer (X) +(133) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(119) ColumnarExchange +(134) VeloxAppendBatches +Input [2]: [o_year#X, mkt_share#X] +Arguments: X + +(135) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(120) ShuffleQueryStage +(136) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] Arguments: X -(121) InputAdapter +(137) InputAdapter Input [2]: [o_year#X, mkt_share#X] -(122) InputIteratorTransformer +(138) InputIteratorTransformer Input [2]: [o_year#X, mkt_share#X] -(123) SortExecTransformer +(139) SortExecTransformer Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(124) WholeStageCodegenTransformer (X) +(140) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(125) VeloxColumnarToRowExec +(141) VeloxColumnarToRowExec Input [2]: [o_year#X, mkt_share#X] -(126) Scan parquet +(142) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_type), EqualTo(p_type,ECONOMY ANODIZED STEEL), IsNotNull(p_partkey)] ReadSchema: struct -(127) Filter +(143) Filter Input [2]: [p_partkey#X, p_type#X] Condition : ((isnotnull(p_type#X) AND (p_type#X = ECONOMY ANODIZED STEEL)) AND isnotnull(p_partkey#X)) -(128) Project +(144) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_type#X] -(129) Exchange +(145) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(130) Scan parquet +(146) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(147) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(131) Filter +(148) Filter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(132) Exchange +(149) Exchange Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(133) ShuffledHashJoin +(150) Sort +Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(151) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(134) Project +(152) Project Output [4]: 
[l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(135) Exchange +(153) Exchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(136) Scan parquet +(154) Sort +Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(155) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(137) Filter +(156) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(138) Exchange +(157) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(139) ShuffledHashJoin +(158) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(159) SortMergeJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(140) Project +(160) Project Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(141) Exchange +(161) Exchange Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(142) Scan parquet +(162) Sort +Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(163) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(143) Filter +(164) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(144) Exchange +(165) Exchange Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(145) ShuffledHashJoin +(166) Sort +Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(167) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(146) Project +(168) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(147) Exchange +(169) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(148) Scan parquet +(170) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(171) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(149) Filter +(172) Filter 
Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(150) Exchange +(173) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(151) ShuffledHashJoin +(174) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(175) SortMergeJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join condition: None -(152) Project +(176) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(153) Exchange +(177) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(154) Scan parquet +(178) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(179) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(155) Filter +(180) Filter Input [2]: [n_nationkey#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(156) Exchange +(181) Exchange Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(157) ShuffledHashJoin +(182) Sort +Input [2]: [n_nationkey#X, n_regionkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(183) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(158) Project +(184) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(159) Exchange +(185) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(160) Scan parquet +(186) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(187) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(161) Filter +(188) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(162) Exchange +(189) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(163) ShuffledHashJoin +(190) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(191) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(164) Project +(192) Project Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(165) Exchange +(193) Exchange Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(166) Scan parquet 
+(194) Sort +Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] +Arguments: [n_regionkey#X ASC NULLS FIRST], false, 0 + +(195) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(167) Filter +(196) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(168) Project +(197) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(169) Exchange +(198) Exchange Input [1]: [r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(170) ShuffledHashJoin +(199) Sort +Input [1]: [r_regionkey#X] +Arguments: [r_regionkey#X ASC NULLS FIRST], false, 0 + +(200) SortMergeJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join condition: None -(171) Project +(201) Project Output [3]: [year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) AS volume#X, n_name#X AS nation#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(172) HashAggregate +(202) HashAggregate Input [3]: [o_year#X, volume#X, nation#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(173) Exchange +(203) Exchange Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(174) HashAggregate +(204) HashAggregate Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [2]: [o_year#X, CheckOverflow((promote_precision(sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X) / promote_precision(sum(volume#X)#X)), DecimalType(38,6)) AS mkt_share#X] -(175) Exchange +(205) Exchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(176) Sort +(206) Sort Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(177) AdaptiveSparkPlan +(207) AdaptiveSparkPlan Output [2]: [o_year#X, mkt_share#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/9.txt index 08e6c30a8da34..15fbf97a77f31 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/9.txt @@ -1,126 +1,136 @@ == Physical Plan == -AdaptiveSparkPlan (133) +AdaptiveSparkPlan (155) +- == Final Plan == - VeloxColumnarToRowExec (94) - +- ^ SortExecTransformer (92) - +- ^ InputIteratorTransformer (91) - +- ^ InputAdapter (90) - +- ^ ShuffleQueryStage (89), Statistics(X) - +- ColumnarExchange (88) - +- ^ RegularHashAggregateExecTransformer (86) - +- ^ 
InputIteratorTransformer (85) - +- ^ InputAdapter (84) - +- ^ ShuffleQueryStage (83), Statistics(X) - +- ColumnarExchange (82) - +- ^ ProjectExecTransformer (80) - +- ^ FlushableHashAggregateExecTransformer (79) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (106) + +- ^ SortExecTransformer (104) + +- ^ InputIteratorTransformer (103) + +- ShuffleQueryStage (101), Statistics(X) + +- ColumnarExchange (100) + +- VeloxAppendBatches (99) + +- ^ RegularHashAggregateExecTransformer (97) + +- ^ InputIteratorTransformer (96) + +- ShuffleQueryStage (94), Statistics(X) + +- ColumnarExchange (93) + +- VeloxAppendBatches (92) + +- ^ ProjectExecTransformer (90) + +- ^ FlushableHashAggregateExecTransformer (89) + +- ^ ProjectExecTransformer (88) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75), Statistics(X) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58), Statistics(X) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41), Statistics(X) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ FilterExecTransformer (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - 
: : : : +- ^ FilterExecTransformer (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ FilterExecTransformer (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ FilterExecTransformer (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ FilterExecTransformer (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74), Statistics(X) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ FilterExecTransformer (70) - +- ^ Scan parquet (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ FilterExecTransformer (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33), Statistics(X) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ FilterExecTransformer (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ FilterExecTransformer (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ FilterExecTransformer (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (86) + +- ShuffleQueryStage (84), Statistics(X) + +- ColumnarExchange (83) + +- VeloxAppendBatches (82) + +- ^ ProjectExecTransformer (80) + +- ^ FilterExecTransformer (79) + +- ^ Scan parquet (78) +- == Initial Plan == - Sort (132) - +- Exchange (131) - +- HashAggregate (130) - +- Exchange (129) - +- HashAggregate (128) - +- Project (127) - +- ShuffledHashJoin Inner BuildRight (126) - :- Exchange (122) - : +- Project (121) - : +- ShuffledHashJoin Inner BuildRight (120) - : :- Exchange (116) - : : +- Project (115) - : : +- ShuffledHashJoin Inner BuildRight (114) - : : :- Exchange (110) - : : : +- Project (109) - : : : +- ShuffledHashJoin Inner BuildRight (108) - : : : :- Exchange (104) - : : : : +- Project (103) - : : : : +- ShuffledHashJoin Inner BuildLeft (102) - : : : : :- Exchange (98) - : : : : : +- Project (97) - : : : : : +- Filter (96) - : : : : : +- Scan parquet (95) - : : : : +- Exchange (101) - : : : : +- Filter (100) - : : : : +- Scan parquet (99) - : : : +- Exchange (107) - : : : +- Filter (106) - : : : +- Scan parquet (105) - : : +- Exchange (113) - : : +- Filter (112) - : : +- Scan parquet (111) - : +- Exchange (119) - : +- Filter (118) - : +- Scan parquet (117) - +- Exchange (125) - +- Filter (124) - +- Scan parquet (123) + Sort (154) + +- Exchange (153) + +- HashAggregate (152) + +- Exchange (151) + +- HashAggregate (150) + +- Project (149) + +- 
SortMergeJoin Inner (148) + :- Sort (143) + : +- Exchange (142) + : +- Project (141) + : +- SortMergeJoin Inner (140) + : :- Sort (135) + : : +- Exchange (134) + : : +- Project (133) + : : +- SortMergeJoin Inner (132) + : : :- Sort (127) + : : : +- Exchange (126) + : : : +- Project (125) + : : : +- SortMergeJoin Inner (124) + : : : :- Sort (119) + : : : : +- Exchange (118) + : : : : +- Project (117) + : : : : +- SortMergeJoin Inner (116) + : : : : :- Sort (111) + : : : : : +- Exchange (110) + : : : : : +- Project (109) + : : : : : +- Filter (108) + : : : : : +- Scan parquet (107) + : : : : +- Sort (115) + : : : : +- Exchange (114) + : : : : +- Filter (113) + : : : : +- Scan parquet (112) + : : : +- Sort (123) + : : : +- Exchange (122) + : : : +- Filter (121) + : : : +- Scan parquet (120) + : : +- Sort (131) + : : +- Exchange (130) + : : +- Filter (129) + : : +- Scan parquet (128) + : +- Sort (139) + : +- Exchange (138) + : +- Filter (137) + : +- Scan parquet (136) + +- Sort (147) + +- Exchange (146) + +- Filter (145) + +- Scan parquet (144) (1) Scan parquet @@ -142,548 +152,636 @@ Input [2]: [p_partkey#X, p_name#X] Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [p_partkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [p_partkey#X] -(9) Scan parquet +(10) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [7]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(15) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, 
l_discount#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [7]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(23) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [8]: [hash(l_suppkey#X, l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: 
[l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(40) ColumnarExchange Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(39) Scan parquet +(44) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(ps_suppkey#X, ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [7]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(49) WholeStageCodegenTransformer (X) +(55) 
WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] +Arguments: X + +(57) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(55) FilterExecTransformer +(62) FilterExecTransformer Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: isnotnull(o_orderkey#X) -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_orderdate#X] Input [2]: [o_orderkey#X, o_orderdate#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderdate#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [o_orderkey#X, o_orderdate#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderdate#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [7]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] +Arguments: X + +(74) ColumnarExchange Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: 
hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(70) FilterExecTransformer +(79) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: isnotnull(n_nationkey#X) -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) as decimal(27,4))) - promote_precision(cast(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(l_quantity#X)), DecimalType(25,4)) as decimal(27,4)))), DecimalType(27,4)) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(79) FlushableHashAggregateExecTransformer +(89) FlushableHashAggregateExecTransformer Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(80) ProjectExecTransformer +(90) ProjectExecTransformer Output [5]: [hash(nation#X, o_year#X, 42) AS hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(81) WholeStageCodegenTransformer (X) +(91) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: false -(82) ColumnarExchange +(92) VeloxAppendBatches +Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, 
isEmpty#X] +Arguments: X + +(93) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(83) ShuffleQueryStage +(94) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: X -(84) InputAdapter +(95) InputAdapter Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(85) InputIteratorTransformer +(96) InputIteratorTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(86) RegularHashAggregateExecTransformer +(97) RegularHashAggregateExecTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [3]: [nation#X, o_year#X, sum_profit#X] +Arguments: X + +(100) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [3]: [nation#X, o_year#X, sum_profit#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] -(92) SortExecTransformer +(104) SortExecTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(93) WholeStageCodegenTransformer (X) +(105) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(94) VeloxColumnarToRowExec +(106) VeloxColumnarToRowExec Input [3]: [nation#X, o_year#X, sum_profit#X] -(95) Scan parquet +(107) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringContains(p_name,green), IsNotNull(p_partkey)] ReadSchema: struct -(96) Filter +(108) Filter Input [2]: [p_partkey#X, p_name#X] Condition : ((isnotnull(p_name#X) AND Contains(p_name#X, green)) AND isnotnull(p_partkey#X)) -(97) Project +(109) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(98) Exchange +(110) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Scan parquet +(111) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(112) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(100) Filter +(113) Filter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(101) Exchange +(114) Exchange Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) ShuffledHashJoin +(115) Sort +Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, 
l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(116) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join condition: None -(103) Project +(117) Project Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(104) Exchange +(118) Exchange Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) Scan parquet +(119) Sort +Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(120) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(106) Filter +(121) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(107) Exchange +(122) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) ShuffledHashJoin +(123) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(124) SortMergeJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join condition: None -(109) Project +(125) Project Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(110) Exchange +(126) Exchange Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(111) Scan parquet +(127) Sort +Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [l_suppkey#X ASC NULLS FIRST, l_partkey#X ASC NULLS FIRST], false, 0 + +(128) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(112) Filter +(129) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Condition : (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(113) Exchange +(130) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(114) ShuffledHashJoin +(131) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST, ps_partkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join condition: None -(115) Project +(133) Project Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(116) Exchange +(134) Exchange Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, 
ps_supplycost#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) Scan parquet +(135) Sort +Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(136) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(118) Filter +(137) Filter Input [2]: [o_orderkey#X, o_orderdate#X] Condition : isnotnull(o_orderkey#X) -(119) Exchange +(138) Exchange Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(120) ShuffledHashJoin +(139) Sort +Input [2]: [o_orderkey#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(140) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join condition: None -(121) Project +(141) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(122) Exchange +(142) Exchange Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(123) Scan parquet +(143) Sort +Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(144) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(124) Filter +(145) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(125) Exchange +(146) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(126) ShuffledHashJoin +(147) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(148) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join condition: None -(127) Project +(149) Project Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(l_extendedprice#X as decimal(13,2))) * promote_precision(CheckOverflow((1.00 - promote_precision(cast(l_discount#X as decimal(13,2)))), DecimalType(13,2)))), DecimalType(26,4)) as decimal(27,4))) - promote_precision(cast(CheckOverflow((promote_precision(ps_supplycost#X) * promote_precision(l_quantity#X)), DecimalType(25,4)) as decimal(27,4)))), DecimalType(27,4)) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(128) HashAggregate +(150) HashAggregate Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(129) Exchange +(151) Exchange Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(130) HashAggregate +(152) HashAggregate Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate 
Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(131) Exchange +(153) Exchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(132) Sort +(154) Sort Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(133) AdaptiveSparkPlan +(155) AdaptiveSparkPlan Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/1.txt index 634f26c86f24a..1e53cd90e1b35 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/1.txt @@ -1,31 +1,31 @@ == Physical Plan == -AdaptiveSparkPlan (28) +AdaptiveSparkPlan (30) +- == Final Plan == - VeloxColumnarToRowExec (19) - +- ^ SortExecTransformer (17) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ RegularHashAggregateExecTransformer (11) - +- ^ InputIteratorTransformer (10) - +- ^ InputAdapter (9) - +- ^ ShuffleQueryStage (8), Statistics(X) - +- ColumnarExchange (7) + VeloxColumnarToRowExec (21) + +- ^ SortExecTransformer (19) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ RegularHashAggregateExecTransformer (12) + +- ^ InputIteratorTransformer (11) + +- ShuffleQueryStage (9), Statistics(X) + +- ColumnarExchange (8) + +- VeloxAppendBatches (7) +- ^ ProjectExecTransformer (5) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ FilterExecTransformer (2) +- ^ Scan parquet (1) +- == Initial Plan == - Sort (27) - +- Exchange (26) - +- HashAggregate (25) - +- Exchange (24) - +- HashAggregate (23) - +- Project (22) - +- Filter (21) - +- Scan parquet (20) + Sort (29) + +- Exchange (28) + +- HashAggregate (27) + +- Exchange (26) + +- HashAggregate (25) + +- Project (24) + +- Filter (23) + +- Scan parquet (22) (1) Scan parquet @@ -58,97 +58,105 @@ Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: false -(7) ColumnarExchange +(7) VeloxAppendBatches +Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] +Arguments: X + +(8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] -(8) ShuffleQueryStage +(9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, 
count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: X -(9) InputAdapter +(10) InputAdapter Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(10) InputIteratorTransformer +(11) InputIteratorTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(11) RegularHashAggregateExecTransformer +(12) RegularHashAggregateExecTransformer Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum((l_extendedprice#X * (1 - l_discount#X))), sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS sum_disc_price#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] +Arguments: X + +(15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(17) SortExecTransformer +(19) SortExecTransformer Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(18) WholeStageCodegenTransformer (X) +(20) WholeStageCodegenTransformer (X) Input [10]: 
[l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: false -(19) VeloxColumnarToRowExec +(21) VeloxColumnarToRowExec Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -(20) Scan parquet +(22) Scan parquet Output [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), LessThanOrEqual(l_shipdate,1998-09-02)] ReadSchema: struct -(21) Filter +(23) Filter Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] Condition : (isnotnull(l_shipdate#X) AND (l_shipdate#X <= 1998-09-02)) -(22) Project +(24) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Input [7]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X, l_shipdate#X] -(23) HashAggregate +(25) HashAggregate Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_tax#X, l_returnflag#X, l_linestatus#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [partial_sum(l_quantity#X), partial_sum(l_extendedprice#X), partial_sum((l_extendedprice#X * (1 - l_discount#X))), partial_sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X))), partial_avg(l_quantity#X), partial_avg(l_extendedprice#X), partial_avg(l_discount#X), partial_count(1)] Aggregate Attributes [15]: [sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Results [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -(24) Exchange +(26) Exchange Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(25) HashAggregate +(27) HashAggregate Input [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] Keys [2]: [l_returnflag#X, l_linestatus#X] Functions [8]: [sum(l_quantity#X), sum(l_extendedprice#X), sum((l_extendedprice#X * (1 - l_discount#X))), sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X))), avg(l_quantity#X), avg(l_extendedprice#X), avg(l_discount#X), count(1)] Aggregate Attributes [8]: [sum(l_quantity#X)#X, sum(l_extendedprice#X)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X, avg(l_quantity#X)#X, avg(l_extendedprice#X)#X, avg(l_discount#X)#X, count(1)#X] Results [10]: [l_returnflag#X, l_linestatus#X, sum(l_quantity#X)#X AS sum_qty#X, sum(l_extendedprice#X)#X AS sum_base_price#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS sum_disc_price#X, sum(((l_extendedprice#X * (1 - l_discount#X)) * (1 + l_tax#X)))#X AS sum_charge#X, avg(l_quantity#X)#X AS avg_qty#X, avg(l_extendedprice#X)#X AS avg_price#X, avg(l_discount#X)#X AS avg_disc#X, count(1)#X AS count_order#X] -(26) Exchange +(28) Exchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, 
sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Sort +(29) Sort Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: [l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST], true, 0 -(28) AdaptiveSparkPlan +(30) AdaptiveSparkPlan Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/10.txt index 873710be9a024..d7376c740f934 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/10.txt @@ -1,85 +1,91 @@ == Physical Plan == -AdaptiveSparkPlan (87) +AdaptiveSparkPlan (100) +- == Final Plan == - VeloxColumnarToRowExec (60) - +- TakeOrderedAndProjectExecTransformer (59) - +- ^ ProjectExecTransformer (57) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) - +- ^ ProjectExecTransformer (50) - +- ^ FlushableHashAggregateExecTransformer (49) - +- ^ ProjectExecTransformer (48) - +- ^ ShuffledHashJoinExecTransformer Inner (47) - :- ^ InputIteratorTransformer (38) - : +- ^ InputAdapter (37) - : +- ^ ShuffleQueryStage (36), Statistics(X) - : +- ColumnarExchange (35) - : +- ^ ProjectExecTransformer (33) - : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : :- ^ InputIteratorTransformer (23) - : : +- ^ InputAdapter (22) - : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : +- ColumnarExchange (20) - : : +- ^ ProjectExecTransformer (18) - : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : :- ^ InputIteratorTransformer (8) - : : : +- ^ InputAdapter (7) - : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (67) + +- TakeOrderedAndProjectExecTransformer (66) + +- ^ ProjectExecTransformer (64) + +- ^ RegularHashAggregateExecTransformer (63) + +- ^ InputIteratorTransformer (62) + +- ShuffleQueryStage (60), Statistics(X) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ ProjectExecTransformer (56) + +- ^ FlushableHashAggregateExecTransformer (55) + +- ^ ProjectExecTransformer (54) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + :- ^ InputIteratorTransformer (43) + : +- ShuffleQueryStage (41), Statistics(X) + : +- ColumnarExchange (40) + : +- VeloxAppendBatches (39) + : +- ^ ProjectExecTransformer (37) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + : :- ^ InputIteratorTransformer (26) + : : +- ShuffleQueryStage (24), Statistics(X) + : : +- ColumnarExchange (23) + : : +- VeloxAppendBatches (22) + : : +- ^ ProjectExecTransformer (20) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : : :- ^ InputIteratorTransformer (9) + : : : +- ShuffleQueryStage (7), Statistics(X) + : : : +- ColumnarExchange (6) + : : : +- VeloxAppendBatches (5) : : : +- ^ ProjectExecTransformer (3) : : : +- ^ FilterExecTransformer (2) : : : +- ^ Scan parquet (1) - : : +- ^ InputIteratorTransformer (16) 
- : : +- ^ InputAdapter (15) - : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : +- ColumnarExchange (13) - : : +- ^ ProjectExecTransformer (11) - : : +- ^ FilterExecTransformer (10) - : : +- ^ Scan parquet (9) - : +- ^ InputIteratorTransformer (31) - : +- ^ InputAdapter (30) - : +- ^ ShuffleQueryStage (29), Statistics(X) - : +- ColumnarExchange (28) - : +- ^ ProjectExecTransformer (26) - : +- ^ FilterExecTransformer (25) - : +- ^ Scan parquet (24) - +- ^ InputIteratorTransformer (46) - +- ^ InputAdapter (45) - +- ^ ShuffleQueryStage (44), Statistics(X) - +- ColumnarExchange (43) - +- ^ ProjectExecTransformer (41) - +- ^ FilterExecTransformer (40) - +- ^ Scan parquet (39) + : : +- ^ InputIteratorTransformer (18) + : : +- ShuffleQueryStage (16), Statistics(X) + : : +- ColumnarExchange (15) + : : +- VeloxAppendBatches (14) + : : +- ^ ProjectExecTransformer (12) + : : +- ^ FilterExecTransformer (11) + : : +- ^ Scan parquet (10) + : +- ^ InputIteratorTransformer (35) + : +- ShuffleQueryStage (33), Statistics(X) + : +- ColumnarExchange (32) + : +- VeloxAppendBatches (31) + : +- ^ ProjectExecTransformer (29) + : +- ^ FilterExecTransformer (28) + : +- ^ Scan parquet (27) + +- ^ InputIteratorTransformer (52) + +- ShuffleQueryStage (50), Statistics(X) + +- ColumnarExchange (49) + +- VeloxAppendBatches (48) + +- ^ ProjectExecTransformer (46) + +- ^ FilterExecTransformer (45) + +- ^ Scan parquet (44) +- == Initial Plan == - TakeOrderedAndProject (86) - +- HashAggregate (85) - +- Exchange (84) - +- HashAggregate (83) - +- Project (82) - +- ShuffledHashJoin Inner BuildRight (81) - :- Exchange (77) - : +- Project (76) - : +- ShuffledHashJoin Inner BuildRight (75) - : :- Exchange (70) - : : +- Project (69) - : : +- ShuffledHashJoin Inner BuildRight (68) - : : :- Exchange (63) - : : : +- Filter (62) - : : : +- Scan parquet (61) - : : +- Exchange (67) - : : +- Project (66) - : : +- Filter (65) - : : +- Scan parquet (64) - : +- Exchange (74) - : +- Project (73) - : +- Filter (72) - : +- Scan parquet (71) - +- Exchange (80) - +- Filter (79) - +- Scan parquet (78) + TakeOrderedAndProject (99) + +- HashAggregate (98) + +- Exchange (97) + +- HashAggregate (96) + +- Project (95) + +- SortMergeJoin Inner (94) + :- Sort (89) + : +- Exchange (88) + : +- Project (87) + : +- SortMergeJoin Inner (86) + : :- Sort (80) + : : +- Exchange (79) + : : +- Project (78) + : : +- SortMergeJoin Inner (77) + : : :- Sort (71) + : : : +- Exchange (70) + : : : +- Filter (69) + : : : +- Scan parquet (68) + : : +- Sort (76) + : : +- Exchange (75) + : : +- Project (74) + : : +- Filter (73) + : : +- Scan parquet (72) + : +- Sort (85) + : +- Exchange (84) + : +- Project (83) + : +- Filter (82) + : +- Scan parquet (81) + +- Sort (93) + +- Exchange (92) + +- Filter (91) + +- Scan parquet (90) (1) Scan parquet @@ -101,364 +107,416 @@ Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acct Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] +Arguments: X + +(6) ColumnarExchange Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], 
[id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [9]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] +Arguments: X + +(23) ColumnarExchange Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [8]: 
[c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Arguments: ((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [10]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(40) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: X 
-(37) InputAdapter +(42) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: isnotnull(n_nationkey#X) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [10]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(49) FlushableHashAggregateExecTransformer +(55) FlushableHashAggregateExecTransformer Input [10]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X, _pre_X#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(50) ProjectExecTransformer +(56) ProjectExecTransformer Output [10]: [hash(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 42) AS hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(51) WholeStageCodegenTransformer (X) +(57) WholeStageCodegenTransformer (X) Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: false -(52) ColumnarExchange +(58) VeloxAppendBatches +Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] +Arguments: X + +(59) 
ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(60) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: X -(54) InputAdapter +(61) InputAdapter Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(55) InputIteratorTransformer +(62) InputIteratorTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(56) RegularHashAggregateExecTransformer +(63) RegularHashAggregateExecTransformer Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(57) ProjectExecTransformer +(64) ProjectExecTransformer Output [8]: [c_custkey#X, c_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Input [8]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(58) WholeStageCodegenTransformer (X) +(65) WholeStageCodegenTransformer (X) Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: false -(59) TakeOrderedAndProjectExecTransformer +(66) TakeOrderedAndProjectExecTransformer Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X], 0 -(60) VeloxColumnarToRowExec +(67) VeloxColumnarToRowExec Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(61) Scan parquet +(68) Scan parquet Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(62) Filter +(69) Filter Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(63) Exchange +(70) Exchange Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) Scan parquet +(71) Sort +Input [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(72) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: 
[IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-10-01), LessThan(o_orderdate,1994-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(65) Filter +(73) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-10-01)) AND (o_orderdate#X < 1994-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(66) Project +(74) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(67) Exchange +(75) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) ShuffledHashJoin +(76) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(77) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(69) Project +(78) Project Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, o_custkey#X] -(70) Exchange +(79) Exchange Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(80) Sort +Input [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(81) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_returnflag), EqualTo(l_returnflag,R), IsNotNull(l_orderkey)] ReadSchema: struct -(72) Filter +(82) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] Condition : ((isnotnull(l_returnflag#X) AND (l_returnflag#X = R)) AND isnotnull(l_orderkey#X)) -(73) Project +(83) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_returnflag#X] -(74) Exchange +(84) Exchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(75) ShuffledHashJoin +(85) Sort +Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(86) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(76) Project +(87) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(77) Exchange +(88) Exchange Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(78) Scan parquet +(89) Sort +Input [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(90) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] 
PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(79) Filter +(91) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(80) Exchange +(92) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(81) ShuffledHashJoin +(93) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(94) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(82) Project +(95) Project Output [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Input [11]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_nationkey#X, n_name#X] -(83) HashAggregate +(96) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_address#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X, n_name#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -(84) Exchange +(97) Exchange Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) HashAggregate +(98) HashAggregate Input [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] Keys [7]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [8]: [c_custkey#X, c_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(86) TakeOrderedAndProject +(99) TakeOrderedAndProject Input [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: X, [revenue#X DESC NULLS LAST], [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] -(87) AdaptiveSparkPlan +(100) AdaptiveSparkPlan Output [8]: [c_custkey#X, c_name#X, revenue#X, c_acctbal#X, n_name#X, c_address#X, c_phone#X, c_comment#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/11.txt index bfe059a11c117..c9371ffbf2c3a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/11.txt @@ -1,71 +1,75 @@ == Physical Plan == -AdaptiveSparkPlan (72) +AdaptiveSparkPlan (82) +- == Final Plan == - VeloxColumnarToRowExec (50) - +- ^ SortExecTransformer (48) - +- ^ InputIteratorTransformer (47) - +- ^ InputAdapter (46) - +- ^ ShuffleQueryStage (45), Statistics(X) - +- ColumnarExchange (44) - +- ^ FilterExecTransformer (42) - +- ^ RegularHashAggregateExecTransformer (41) - +- ^ InputIteratorTransformer (40) - +- ^ InputAdapter (39) - +- ^ ShuffleQueryStage (38), 
Statistics(X) - +- ColumnarExchange (37) - +- ^ ProjectExecTransformer (35) - +- ^ FlushableHashAggregateExecTransformer (34) - +- ^ ProjectExecTransformer (33) - +- ^ ShuffledHashJoinExecTransformer Inner (32) - :- ^ InputIteratorTransformer (23) - : +- ^ InputAdapter (22) - : +- ^ ShuffleQueryStage (21), Statistics(X) - : +- ColumnarExchange (20) - : +- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (56) + +- ^ SortExecTransformer (54) + +- ^ InputIteratorTransformer (53) + +- ShuffleQueryStage (51), Statistics(X) + +- ColumnarExchange (50) + +- VeloxAppendBatches (49) + +- ^ FilterExecTransformer (47) + +- ^ RegularHashAggregateExecTransformer (46) + +- ^ InputIteratorTransformer (45) + +- ShuffleQueryStage (43), Statistics(X) + +- ColumnarExchange (42) + +- VeloxAppendBatches (41) + +- ^ ProjectExecTransformer (39) + +- ^ FlushableHashAggregateExecTransformer (38) + +- ^ ProjectExecTransformer (37) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + :- ^ InputIteratorTransformer (26) + : +- ShuffleQueryStage (24), Statistics(X) + : +- ColumnarExchange (23) + : +- VeloxAppendBatches (22) + : +- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14), Statistics(X) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ FilterExecTransformer (10) - : +- ^ Scan parquet (9) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ ProjectExecTransformer (26) - +- ^ FilterExecTransformer (25) - +- ^ Scan parquet (24) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FilterExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ ProjectExecTransformer (29) + +- ^ FilterExecTransformer (28) + +- ^ Scan parquet (27) +- == Initial Plan == - Sort (71) - +- Exchange (70) - +- Filter (69) - +- HashAggregate (68) - +- Exchange (67) - +- HashAggregate (66) - +- Project (65) - +- ShuffledHashJoin Inner BuildRight (64) - :- Exchange (59) - : +- Project (58) - : +- ShuffledHashJoin Inner BuildRight (57) - : :- Exchange (53) - : : +- Filter (52) - : : +- Scan parquet (51) - : +- Exchange (56) - : +- Filter (55) - : +- Scan parquet (54) - +- Exchange (63) - +- Project (62) - +- Filter (61) - +- Scan parquet (60) + Sort (81) + +- Exchange (80) + +- Filter (79) + +- HashAggregate (78) + +- Exchange (77) + +- HashAggregate (76) + +- Project (75) + +- SortMergeJoin Inner (74) + :- Sort (68) + : +- Exchange (67) + : +- Project (66) + : +- SortMergeJoin Inner (65) + : :- Sort (60) + : : +- Exchange (59) + : : +- Filter (58) + : : +- Scan parquet (57) + : +- Sort (64) + : +- Exchange (63) + : +- Filter (62) + : +- Scan parquet (61) + +- Sort (73) + +- Exchange 
(72) + +- Project (71) + +- Filter (70) + +- Scan parquet (69) (1) Scan parquet @@ -87,561 +91,627 @@ Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(9) Scan parquet +(10) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: X + +(23) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [4]: [ps_partkey#X, 
ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [1]: [n_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [1]: [n_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, (ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))) AS _pre_X#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(34) FlushableHashAggregateExecTransformer +(38) FlushableHashAggregateExecTransformer Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(35) ProjectExecTransformer +(39) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(36) WholeStageCodegenTransformer (X) +(40) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: false -(37) ColumnarExchange +(41) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] +Arguments: X + +(42) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(38) ShuffleQueryStage +(43) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: X -(39) InputAdapter +(44) InputAdapter Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(40) InputIteratorTransformer +(45) InputIteratorTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] -(41) RegularHashAggregateExecTransformer +(46) RegularHashAggregateExecTransformer Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] 
Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [2]: [ps_partkey#X, sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X AS value#X] -(42) FilterExecTransformer +(47) FilterExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(43) WholeStageCodegenTransformer (X) +(48) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(44) ColumnarExchange +(49) VeloxAppendBatches +Input [2]: [ps_partkey#X, value#X] +Arguments: X + +(50) ColumnarExchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(45) ShuffleQueryStage +(51) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] Arguments: X -(46) InputAdapter +(52) InputAdapter Input [2]: [ps_partkey#X, value#X] -(47) InputIteratorTransformer +(53) InputIteratorTransformer Input [2]: [ps_partkey#X, value#X] -(48) SortExecTransformer +(54) SortExecTransformer Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [2]: [ps_partkey#X, value#X] Arguments: false -(50) VeloxColumnarToRowExec +(56) VeloxColumnarToRowExec Input [2]: [ps_partkey#X, value#X] -(51) Scan parquet +(57) Scan parquet Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(52) Filter +(58) Filter Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(53) Exchange +(59) Exchange Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) Scan parquet +(60) Sort +Input [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(61) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(55) Filter +(62) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(56) Exchange +(63) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(57) ShuffledHashJoin +(64) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(65) SortMergeJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(58) Project +(66) Project Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(59) Exchange +(67) Exchange Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(60) Scan parquet +(68) Sort +Input [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(69) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct 
-(61) Filter +(70) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(62) Project +(71) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(63) Exchange +(72) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) ShuffledHashJoin +(73) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(74) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(65) Project +(75) Project Output [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Input [5]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(66) HashAggregate +(76) HashAggregate Input [3]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X] Keys [1]: [ps_partkey#X] Functions [1]: [partial_sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [ps_partkey#X, sum#X, isEmpty#X] -(67) Exchange +(77) Exchange Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) HashAggregate +(78) HashAggregate Input [3]: [ps_partkey#X, sum#X, isEmpty#X] Keys [1]: [ps_partkey#X] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [2]: [ps_partkey#X, sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X AS value#X] -(69) Filter +(79) Filter Input [2]: [ps_partkey#X, value#X] Condition : (isnotnull(value#X) AND (cast(value#X as decimal(38,6)) > Subquery subquery#X, [id=#X])) -(70) Exchange +(80) Exchange Input [2]: [ps_partkey#X, value#X] Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Sort +(81) Sort Input [2]: [ps_partkey#X, value#X] Arguments: [value#X DESC NULLS LAST], true, 0 -(72) AdaptiveSparkPlan +(82) AdaptiveSparkPlan Output [2]: [ps_partkey#X, value#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting operator id = 42 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (120) +Subquery:1 Hosting operator id = 47 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (136) +- == Final Plan == - VeloxColumnarToRowExec (102) - +- ^ ProjectExecTransformer (100) - +- ^ RegularHashAggregateExecTransformer (99) - +- ^ RegularHashAggregateExecTransformer (98) - +- ^ ProjectExecTransformer (97) - +- ^ ShuffledHashJoinExecTransformer Inner (96) - :- ^ InputIteratorTransformer (91) - : +- ^ InputAdapter (90) - : +- ^ ShuffleQueryStage (89), Statistics(X) - : +- ColumnarExchange (88) - : +- ^ ProjectExecTransformer (86) - : +- ^ ShuffledHashJoinExecTransformer Inner (85) - : :- ^ InputIteratorTransformer (80) - : : +- ^ InputAdapter (79) - : : +- ^ ShuffleQueryStage (78), Statistics(X) - : : +- ColumnarExchange (77) - : : +- ^ ProjectExecTransformer (75) - : : +- ^ FilterExecTransformer (74) - : : +- ^ Scan parquet (73) - : +- ^ InputIteratorTransformer (84) - : +- ^ InputAdapter (83) - : +- ^ ShuffleQueryStage (82), Statistics(X) - : +- ReusedExchange (81) - +- ^ InputIteratorTransformer (95) - +- ^ InputAdapter (94) - +- ^ ShuffleQueryStage (93), Statistics(X) - +- ReusedExchange (92) + VeloxColumnarToRowExec (114) + +- ^ ProjectExecTransformer (112) + +- ^ RegularHashAggregateExecTransformer (111) 
+ +- ^ RegularHashAggregateExecTransformer (110) + +- ^ ProjectExecTransformer (109) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (108) + :- ^ InputIteratorTransformer (103) + : +- ShuffleQueryStage (101), Statistics(X) + : +- ColumnarExchange (100) + : +- VeloxAppendBatches (99) + : +- ^ ProjectExecTransformer (97) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (96) + : :- ^ InputIteratorTransformer (91) + : : +- ShuffleQueryStage (89), Statistics(X) + : : +- ColumnarExchange (88) + : : +- VeloxAppendBatches (87) + : : +- ^ ProjectExecTransformer (85) + : : +- ^ FilterExecTransformer (84) + : : +- ^ Scan parquet (83) + : +- ^ InputIteratorTransformer (95) + : +- ShuffleQueryStage (93), Statistics(X) + : +- ReusedExchange (92) + +- ^ InputIteratorTransformer (107) + +- ShuffleQueryStage (105), Statistics(X) + +- ReusedExchange (104) +- == Initial Plan == - HashAggregate (119) - +- HashAggregate (118) - +- Project (117) - +- ShuffledHashJoin Inner BuildRight (116) - :- Exchange (111) - : +- Project (110) - : +- ShuffledHashJoin Inner BuildRight (109) - : :- Exchange (105) - : : +- Filter (104) - : : +- Scan parquet (103) - : +- Exchange (108) - : +- Filter (107) - : +- Scan parquet (106) - +- Exchange (115) - +- Project (114) - +- Filter (113) - +- Scan parquet (112) - - -(73) Scan parquet + HashAggregate (135) + +- HashAggregate (134) + +- Project (133) + +- SortMergeJoin Inner (132) + :- Sort (126) + : +- Exchange (125) + : +- Project (124) + : +- SortMergeJoin Inner (123) + : :- Sort (118) + : : +- Exchange (117) + : : +- Filter (116) + : : +- Scan parquet (115) + : +- Sort (122) + : +- Exchange (121) + : +- Filter (120) + : +- Scan parquet (119) + +- Sort (131) + +- Exchange (130) + +- Project (129) + +- Filter (128) + +- Scan parquet (127) + + +(83) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(74) FilterExecTransformer +(84) FilterExecTransformer Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: isnotnull(ps_suppkey#X) -(75) ProjectExecTransformer +(85) ProjectExecTransformer Output [4]: [hash(ps_suppkey#X, 42) AS hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(76) WholeStageCodegenTransformer (X) +(86) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: false -(77) ColumnarExchange +(87) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: X + +(88) ColumnarExchange Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] -(78) ShuffleQueryStage +(89) ShuffleQueryStage Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: X -(79) InputAdapter +(90) InputAdapter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(80) InputIteratorTransformer +(91) InputIteratorTransformer Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -(81) ReusedExchange [Reuses operator id: 13] +(92) ReusedExchange [Reuses operator id: 15] Output [2]: [s_suppkey#X, s_nationkey#X] -(82) ShuffleQueryStage +(93) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(83) InputAdapter +(94) InputAdapter Input [2]: [s_suppkey#X, 
s_nationkey#X] -(84) InputIteratorTransformer +(95) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(85) ShuffledHashJoinExecTransformer +(96) ShuffledHashJoinExecTransformer Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(86) ProjectExecTransformer +(97) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: X + +(100) ColumnarExchange Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] -(92) ReusedExchange [Reuses operator id: 28] +(104) ReusedExchange [Reuses operator id: 32] Output [1]: [n_nationkey#X] -(93) ShuffleQueryStage +(105) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(94) InputAdapter +(106) InputAdapter Input [1]: [n_nationkey#X] -(95) InputIteratorTransformer +(107) InputIteratorTransformer Input [1]: [n_nationkey#X] -(96) ShuffledHashJoinExecTransformer +(108) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(97) ProjectExecTransformer +(109) ProjectExecTransformer Output [3]: [ps_availqty#X, ps_supplycost#X, (ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))) AS _pre_X#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(98) RegularHashAggregateExecTransformer +(110) RegularHashAggregateExecTransformer Input [3]: [ps_availqty#X, ps_supplycost#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(99) RegularHashAggregateExecTransformer +(111) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] -(100) ProjectExecTransformer +(112) ProjectExecTransformer Output [1]: [(sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X * 0.0001000000) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Input [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] -(101) WholeStageCodegenTransformer (X) +(113) WholeStageCodegenTransformer (X) Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: false -(102) VeloxColumnarToRowExec +(114) VeloxColumnarToRowExec Input [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(103) Scan parquet +(115) Scan parquet Output [3]: [ps_suppkey#X, ps_availqty#X, 
ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey)] ReadSchema: struct -(104) Filter +(116) Filter Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Condition : isnotnull(ps_suppkey#X) -(105) Exchange +(117) Exchange Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) Scan parquet +(118) Sort +Input [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(119) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(107) Filter +(120) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(108) Exchange +(121) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(122) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(123) SortMergeJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(110) Project +(124) Project Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Input [5]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X, s_suppkey#X, s_nationkey#X] -(111) Exchange +(125) Exchange Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) Scan parquet +(126) Sort +Input [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(127) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,GERMANY), IsNotNull(n_nationkey)] ReadSchema: struct -(113) Filter +(128) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = GERMANY)) AND isnotnull(n_nationkey#X)) -(114) Project +(129) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(115) Exchange +(130) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(116) ShuffledHashJoin +(131) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(117) Project +(133) Project Output [2]: [ps_availqty#X, ps_supplycost#X] Input [4]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X, n_nationkey#X] -(118) HashAggregate +(134) HashAggregate Input [2]: [ps_availqty#X, ps_supplycost#X] Keys: [] Functions [1]: [partial_sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(119) HashAggregate +(135) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))] Aggregate Attributes [1]: [sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X] Results [1]: [(sum((ps_supplycost#X * cast(ps_availqty#X as decimal(10,0))))#X * 0.0001000000) AS (sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] -(120) AdaptiveSparkPlan +(136) AdaptiveSparkPlan Output [1]: [(sum((ps_supplycost * ps_availqty)) * 0.0001000000)#X] Arguments: 
isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/12.txt index 85c06695e1474..ce033f5468d1b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/12.txt @@ -1,50 +1,52 @@ == Physical Plan == -AdaptiveSparkPlan (49) +AdaptiveSparkPlan (55) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- ^ SortExecTransformer (32) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (48) - +- Exchange (47) - +- HashAggregate (46) - +- Exchange (45) - +- HashAggregate (44) - +- Project (43) - +- ShuffledHashJoin Inner BuildLeft (42) - :- Exchange (37) - : +- Filter (36) - : +- Scan parquet (35) - +- Exchange (41) - +- Project (40) - +- Filter (39) - +- Scan parquet (38) + Sort (54) + +- Exchange (53) + +- HashAggregate (52) + +- Exchange (51) + +- HashAggregate (50) + +- Project (49) + +- SortMergeJoin Inner (48) + :- Sort (42) + : +- Exchange (41) + : +- Filter (40) + : +- Scan parquet (39) + +- Sort (47) + +- Exchange (46) + +- Project (45) + +- Filter (44) + +- Scan parquet (43) (1) Scan parquet @@ -66,198 +68,222 @@ Input [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, 
o_orderpriority#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [o_orderkey#X, o_orderpriority#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderpriority#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Arguments: ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_shipmode#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [l_orderkey#X, l_shipmode#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [l_orderkey#X, l_shipmode#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [o_orderpriority#X, l_shipmode#X, CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END AS _pre_X#X, CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END AS _pre_X#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [o_orderpriority#X, l_shipmode#X, _pre_X#X, _pre_X#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(_pre_X#X), partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [4]: [hash(l_shipmode#X, 42) AS hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Input [3]: [l_shipmode#X, sum#X, sum#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, 
sum#X] +Arguments: X + +(25) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [3]: [l_shipmode#X, sum#X, sum#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(32) SortExecTransformer +(36) SortExecTransformer Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(33) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: false -(34) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -(35) Scan parquet +(39) Scan parquet Output [2]: [o_orderkey#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(36) Filter +(40) Filter Input [2]: [o_orderkey#X, o_orderpriority#X] Condition : isnotnull(o_orderkey#X) -(37) Exchange +(41) Exchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(38) Scan parquet +(42) Sort +Input [2]: [o_orderkey#X, o_orderpriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(43) Scan parquet Output [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate), IsNotNull(l_shipdate), In(l_shipmode, [MAIL,SHIP]), 
GreaterThanOrEqual(l_receiptdate,1994-01-01), LessThan(l_receiptdate,1995-01-01), IsNotNull(l_orderkey)] ReadSchema: struct -(39) Filter +(44) Filter Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] Condition : ((((((((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND isnotnull(l_shipdate#X)) AND l_shipmode#X IN (MAIL,SHIP)) AND (l_commitdate#X < l_receiptdate#X)) AND (l_shipdate#X < l_commitdate#X)) AND (l_receiptdate#X >= 1994-01-01)) AND (l_receiptdate#X < 1995-01-01)) AND isnotnull(l_orderkey#X)) -(40) Project +(45) Project Output [2]: [l_orderkey#X, l_shipmode#X] Input [5]: [l_orderkey#X, l_shipdate#X, l_commitdate#X, l_receiptdate#X, l_shipmode#X] -(41) Exchange +(46) Exchange Input [2]: [l_orderkey#X, l_shipmode#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(42) ShuffledHashJoin +(47) Sort +Input [2]: [l_orderkey#X, l_shipmode#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(48) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(43) Project +(49) Project Output [2]: [o_orderpriority#X, l_shipmode#X] Input [4]: [o_orderkey#X, o_orderpriority#X, l_orderkey#X, l_shipmode#X] -(44) HashAggregate +(50) HashAggregate Input [2]: [o_orderpriority#X, l_shipmode#X] Keys [1]: [l_shipmode#X] Functions [2]: [partial_sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), partial_sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum#X, sum#X] Results [3]: [l_shipmode#X, sum#X, sum#X] -(45) Exchange +(51) Exchange Input [3]: [l_shipmode#X, sum#X, sum#X] Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(46) HashAggregate +(52) HashAggregate Input [3]: [l_shipmode#X, sum#X, sum#X] Keys [1]: [l_shipmode#X] Functions [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END), sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)] Aggregate Attributes [2]: [sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X] Results [3]: [l_shipmode#X, sum(CASE WHEN ((o_orderpriority#X = 1-URGENT) OR (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS high_line_count#X, sum(CASE WHEN (NOT (o_orderpriority#X = 1-URGENT) AND NOT (o_orderpriority#X = 2-HIGH)) THEN 1 ELSE 0 END)#X AS low_line_count#X] -(47) Exchange +(53) Exchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(48) Sort +(54) Sort Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: [l_shipmode#X ASC NULLS FIRST], true, 0 -(49) AdaptiveSparkPlan +(55) AdaptiveSparkPlan Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/13.txt index c3526c707e4be..c71d03b93e127 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/13.txt @@ -1,53 +1,55 @@ == 
Physical Plan == -AdaptiveSparkPlan (52) +AdaptiveSparkPlan (58) +- == Final Plan == - VeloxColumnarToRowExec (36) - +- ^ SortExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31), Statistics(X) - +- ColumnarExchange (30) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ ProjectExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ RegularHashAggregateExecTransformer (18) - +- ^ ProjectExecTransformer (17) - +- ^ ShuffledHashJoinExecTransformer LeftOuter (16) - :- ^ InputIteratorTransformer (7) - : +- ^ InputAdapter (6) - : +- ^ ShuffleQueryStage (5), Statistics(X) - : +- ColumnarExchange (4) + VeloxColumnarToRowExec (40) + +- ^ SortExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35), Statistics(X) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ FlushableHashAggregateExecTransformer (23) + +- ^ ProjectExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ RegularHashAggregateExecTransformer (20) + +- ^ ProjectExecTransformer (19) + +- ^ ShuffledHashJoinExecTransformer LeftOuter BuildLeft (18) + :- ^ InputIteratorTransformer (8) + : +- ShuffleQueryStage (6), Statistics(X) + : +- ColumnarExchange (5) + : +- VeloxAppendBatches (4) : +- ^ ProjectExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (15) - +- ^ InputAdapter (14) - +- ^ ShuffleQueryStage (13), Statistics(X) - +- ColumnarExchange (12) - +- ^ ProjectExecTransformer (10) - +- ^ FilterExecTransformer (9) - +- ^ Scan parquet (8) + +- ^ InputIteratorTransformer (17) + +- ShuffleQueryStage (15), Statistics(X) + +- ColumnarExchange (14) + +- VeloxAppendBatches (13) + +- ^ ProjectExecTransformer (11) + +- ^ FilterExecTransformer (10) + +- ^ Scan parquet (9) +- == Initial Plan == - Sort (51) - +- Exchange (50) - +- HashAggregate (49) - +- Exchange (48) - +- HashAggregate (47) - +- HashAggregate (46) - +- HashAggregate (45) - +- Project (44) - +- ShuffledHashJoin LeftOuter BuildRight (43) - :- Exchange (38) - : +- Scan parquet (37) - +- Exchange (42) - +- Project (41) - +- Filter (40) - +- Scan parquet (39) + Sort (57) + +- Exchange (56) + +- HashAggregate (55) + +- Exchange (54) + +- HashAggregate (53) + +- HashAggregate (52) + +- HashAggregate (51) + +- Project (50) + +- SortMergeJoin LeftOuter (49) + :- Sort (43) + : +- Exchange (42) + : +- Scan parquet (41) + +- Sort (48) + +- Exchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -64,225 +66,249 @@ Input [1]: [c_custkey#X] Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: false -(4) ColumnarExchange +(4) VeloxAppendBatches +Input [2]: [hash_partition_key#X, c_custkey#X] +Arguments: X + +(5) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] -(5) ShuffleQueryStage +(6) ShuffleQueryStage Output [1]: [c_custkey#X] Arguments: X -(6) InputAdapter +(7) InputAdapter Input [1]: [c_custkey#X] -(7) InputIteratorTransformer +(8) 
InputIteratorTransformer Input [1]: [c_custkey#X] -(8) Scan parquet +(9) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(9) FilterExecTransformer +(10) FilterExecTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Arguments: ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(10) ProjectExecTransformer +(11) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(12) ColumnarExchange +(13) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(14) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(13) ShuffleQueryStage +(15) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(14) InputAdapter +(16) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(15) InputIteratorTransformer +(17) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(16) ShuffledHashJoinExecTransformer +(18) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: LeftOuter Join condition: None -(17) ProjectExecTransformer +(19) ProjectExecTransformer Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(18) RegularHashAggregateExecTransformer +(20) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [2]: [c_custkey#X, count(o_orderkey#X)#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [1]: [count(o_orderkey#X)#X AS c_count#X] Input [2]: [c_custkey#X, count(o_orderkey#X)#X] -(21) FlushableHashAggregateExecTransformer +(23) FlushableHashAggregateExecTransformer Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [3]: [hash(c_count#X, 42) AS hash_partition_key#X, c_count#X, count#X] Input [2]: [c_count#X, count#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_count#X, count#X] +Arguments: X + +(27) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [2]: [c_count#X, count#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [2]: [c_count#X, count#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input 
[2]: [c_count#X, count#X] -(28) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(30) ColumnarExchange +(33) VeloxAppendBatches +Input [2]: [c_count#X, custdist#X] +Arguments: X + +(34) ColumnarExchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(35) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] Arguments: X -(32) InputAdapter +(36) InputAdapter Input [2]: [c_count#X, custdist#X] -(33) InputIteratorTransformer +(37) InputIteratorTransformer Input [2]: [c_count#X, custdist#X] -(34) SortExecTransformer +(38) SortExecTransformer Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [2]: [c_count#X, custdist#X] Arguments: false -(36) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [2]: [c_count#X, custdist#X] -(37) Scan parquet +(41) Scan parquet Output [1]: [c_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Exchange +(42) Exchange Input [1]: [c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) Scan parquet +(43) Sort +Input [1]: [c_custkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(44) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_comment), IsNotNull(o_custkey)] ReadSchema: struct -(40) Filter +(45) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] Condition : ((isnotnull(o_comment#X) AND NOT o_comment#X LIKE %special%requests%) AND isnotnull(o_custkey#X)) -(41) Project +(46) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_comment#X] -(42) Exchange +(47) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) ShuffledHashJoin +(48) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(49) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: LeftOuter Join condition: None -(44) Project +(50) Project Output [2]: [c_custkey#X, o_orderkey#X] Input [3]: [c_custkey#X, o_orderkey#X, o_custkey#X] -(45) HashAggregate +(51) HashAggregate Input [2]: [c_custkey#X, o_orderkey#X] Keys [1]: [c_custkey#X] Functions [1]: [partial_count(o_orderkey#X)] Aggregate Attributes [1]: [count#X] Results [2]: [c_custkey#X, count#X] -(46) HashAggregate +(52) HashAggregate Input [2]: [c_custkey#X, count#X] Keys [1]: [c_custkey#X] Functions [1]: [count(o_orderkey#X)] Aggregate Attributes [1]: [count(o_orderkey#X)#X] Results [1]: [count(o_orderkey#X)#X AS c_count#X] -(47) HashAggregate +(53) HashAggregate Input [1]: [c_count#X] Keys [1]: [c_count#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [c_count#X, count#X] -(48) Exchange +(54) Exchange Input [2]: [c_count#X, count#X] Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) HashAggregate +(55) 
HashAggregate Input [2]: [c_count#X, count#X] Keys [1]: [c_count#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [c_count#X, count(1)#X AS custdist#X] -(50) Exchange +(56) Exchange Input [2]: [c_count#X, custdist#X] Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Sort +(57) Sort Input [2]: [c_count#X, custdist#X] Arguments: [custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST], true, 0 -(52) AdaptiveSparkPlan +(58) AdaptiveSparkPlan Output [2]: [c_count#X, custdist#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/14.txt index fd8974a5b6cbd..492d3f8b9d073 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/14.txt @@ -1,38 +1,40 @@ == Physical Plan == -AdaptiveSparkPlan (35) +AdaptiveSparkPlan (39) +- == Final Plan == - VeloxColumnarToRowExec (23) - +- ^ ProjectExecTransformer (21) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (25) + +- ^ ProjectExecTransformer (23) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - HashAggregate (34) - +- HashAggregate (33) - +- Project (32) - +- ShuffledHashJoin Inner BuildRight (31) - :- Exchange (27) - : +- Project (26) - : +- Filter (25) - : +- Scan parquet (24) - +- Exchange (30) - +- Filter (29) - +- Scan parquet (28) + HashAggregate (38) + +- HashAggregate (37) + +- Project (36) + +- SortMergeJoin Inner (35) + :- Sort (30) + : +- Exchange (29) + : +- Project (28) + : +- Filter (27) + : +- Scan parquet (26) + +- Sort (34) + +- Exchange (33) + +- Filter (32) + +- Scan parquet (31) (1) Scan parquet @@ -54,146 +56,162 @@ Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] 
-(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] -(9) Scan parquet +(10) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [2]: [p_partkey#X, p_type#X] Arguments: isnotnull(p_partkey#X) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_type#X] Input [2]: [p_partkey#X, p_type#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [p_partkey#X, p_type#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [p_partkey#X, p_type#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [p_partkey#X, p_type#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [l_extendedprice#X, l_discount#X, p_type#X, CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END AS _pre_X#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [5]: [l_extendedprice#X, l_discount#X, p_type#X, _pre_X#X, _pre_X#X] Keys: [] Functions [2]: [partial_sum(_pre_X#X), partial_sum(_pre_X#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(20) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END), sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(21) ProjectExecTransformer +(23) ProjectExecTransformer Output [1]: [((100.00 * sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X) / sum((l_extendedprice#X * (1 - l_discount#X)))#X) AS promo_revenue#X] Input [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(22) WholeStageCodegenTransformer (X) 
+(24) WholeStageCodegenTransformer (X) Input [1]: [promo_revenue#X] Arguments: false -(23) VeloxColumnarToRowExec +(25) VeloxColumnarToRowExec Input [1]: [promo_revenue#X] -(24) Scan parquet +(26) Scan parquet Output [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-09-01), LessThan(l_shipdate,1995-10-01), IsNotNull(l_partkey)] ReadSchema: struct -(25) Filter +(27) Filter Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-09-01)) AND (l_shipdate#X < 1995-10-01)) AND isnotnull(l_partkey#X)) -(26) Project +(28) Project Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_partkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(27) Exchange +(29) Exchange Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(28) Scan parquet +(30) Sort +Input [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(31) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_partkey)] ReadSchema: struct -(29) Filter +(32) Filter Input [2]: [p_partkey#X, p_type#X] Condition : isnotnull(p_partkey#X) -(30) Exchange +(33) Exchange Input [2]: [p_partkey#X, p_type#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(31) ShuffledHashJoin +(34) Sort +Input [2]: [p_partkey#X, p_type#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(35) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(32) Project +(36) Project Output [3]: [l_extendedprice#X, l_discount#X, p_type#X] Input [5]: [l_partkey#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_type#X] -(33) HashAggregate +(37) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, p_type#X] Keys: [] Functions [2]: [partial_sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END), partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -(34) HashAggregate +(38) HashAggregate Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Keys: [] Functions [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END), sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [((100.00 * sum(CASE WHEN StartsWith(p_type#X, PROMO) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END)#X) / sum((l_extendedprice#X * (1 - l_discount#X)))#X) AS promo_revenue#X] -(35) AdaptiveSparkPlan +(39) AdaptiveSparkPlan Output [1]: [promo_revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/15.txt index 8fd855a59c424..129e4ad927e99 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/15.txt @@ -1,45 +1,48 @@ == 
Physical Plan == -AdaptiveSparkPlan (42) +AdaptiveSparkPlan (47) +- == Final Plan == - VeloxColumnarToRowExec (27) - +- AQEShuffleRead (26) - +- ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ ShuffledHashJoinExecTransformer Inner (21) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) - : +- ^ ProjectExecTransformer (3) - : +- ^ FilterExecTransformer (2) - : +- ^ Scan parquet (1) - +- ^ FilterExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ InputIteratorTransformer (18) - +- ^ InputAdapter (17) - +- ^ ShuffleQueryStage (16), Statistics(X) - +- ColumnarExchange (15) - +- ^ ProjectExecTransformer (13) - +- ^ FlushableHashAggregateExecTransformer (12) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + VeloxColumnarToRowExec (30) + +- AQEShuffleRead (29) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (23) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) + : +- ^ ProjectExecTransformer (3) + : +- ^ FilterExecTransformer (2) + : +- ^ Scan parquet (1) + +- ^ FilterExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ InputIteratorTransformer (20) + +- ShuffleQueryStage (18), Statistics(X) + +- ColumnarExchange (17) + +- VeloxAppendBatches (16) + +- ^ ProjectExecTransformer (14) + +- ^ FlushableHashAggregateExecTransformer (13) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (41) - +- Exchange (40) - +- Project (39) - +- ShuffledHashJoin Inner BuildLeft (38) - :- Exchange (30) - : +- Filter (29) - : +- Scan parquet (28) - +- Filter (37) - +- HashAggregate (36) - +- Exchange (35) - +- HashAggregate (34) - +- Project (33) - +- Filter (32) - +- Scan parquet (31) + Sort (46) + +- Exchange (45) + +- Project (44) + +- SortMergeJoin Inner (43) + :- Sort (34) + : +- Exchange (33) + : +- Filter (32) + : +- Scan parquet (31) + +- Sort (42) + +- Filter (41) + +- HashAggregate (40) + +- Exchange (39) + +- HashAggregate (38) + +- Project (37) + +- Filter (36) + +- Scan parquet (35) (1) Scan parquet @@ -61,331 +64,355 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), 
GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(12) FlushableHashAggregateExecTransformer +(13) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(13) ProjectExecTransformer +(14) ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(14) WholeStageCodegenTransformer (X) +(15) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(15) ColumnarExchange +(16) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(17) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(16) ShuffleQueryStage +(18) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(17) InputAdapter +(19) InputAdapter Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(18) InputIteratorTransformer +(20) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] -(20) FilterExecTransformer +(22) FilterExecTransformer Input [2]: [supplier_no#X, total_revenue#X] Arguments: (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(21) ShuffledHashJoinExecTransformer +(23) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join type: Inner Join condition: None -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] +Arguments: X + +(27) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] 
Arguments: X -(26) AQEShuffleRead +(29) AQEShuffleRead Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: local -(27) VeloxColumnarToRowExec +(30) VeloxColumnarToRowExec Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -(28) Scan parquet +(31) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey)] ReadSchema: struct -(29) Filter +(32) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Condition : isnotnull(s_suppkey#X) -(30) Exchange +(33) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(31) Scan parquet +(34) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(35) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01), IsNotNull(l_suppkey)] ReadSchema: struct -(32) Filter +(36) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) AND isnotnull(l_suppkey#X)) -(33) Project +(37) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(34) HashAggregate +(38) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(35) Exchange +(39) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(36) HashAggregate +(40) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [l_suppkey#X AS supplier_no#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] -(37) Filter +(41) Filter Input [2]: [supplier_no#X, total_revenue#X] Condition : (isnotnull(total_revenue#X) AND (total_revenue#X = Subquery subquery#X, [id=#X])) -(38) ShuffledHashJoin +(42) Sort +Input [2]: [supplier_no#X, total_revenue#X] +Arguments: [supplier_no#X ASC NULLS FIRST], false, 0 + +(43) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [supplier_no#X] Join type: Inner Join condition: None -(39) Project +(44) Project Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Input [6]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, supplier_no#X, total_revenue#X] -(40) Exchange +(45) Exchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Sort +(46) Sort Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: [s_suppkey#X ASC NULLS FIRST], true, 0 -(42) AdaptiveSparkPlan +(47) AdaptiveSparkPlan Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] Arguments: isFinalPlan=true ===== Subqueries ===== -Subquery:1 Hosting 
operator id = 20 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (67) +Subquery:1 Hosting operator id = 22 Hosting Expression = Subquery subquery#X, [id=#X] +AdaptiveSparkPlan (73) +- == Final Plan == - VeloxColumnarToRowExec (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ RegularHashAggregateExecTransformer (55) - +- ^ ProjectExecTransformer (54) - +- ^ RegularHashAggregateExecTransformer (53) - +- ^ InputIteratorTransformer (52) - +- ^ InputAdapter (51) - +- ^ ShuffleQueryStage (50), Statistics(X) - +- ColumnarExchange (49) - +- ^ ProjectExecTransformer (47) - +- ^ FlushableHashAggregateExecTransformer (46) - +- ^ ProjectExecTransformer (45) - +- ^ FilterExecTransformer (44) - +- ^ Scan parquet (43) + VeloxColumnarToRowExec (64) + +- ^ RegularHashAggregateExecTransformer (62) + +- ^ RegularHashAggregateExecTransformer (61) + +- ^ ProjectExecTransformer (60) + +- ^ RegularHashAggregateExecTransformer (59) + +- ^ InputIteratorTransformer (58) + +- ShuffleQueryStage (56), Statistics(X) + +- ColumnarExchange (55) + +- VeloxAppendBatches (54) + +- ^ ProjectExecTransformer (52) + +- ^ FlushableHashAggregateExecTransformer (51) + +- ^ ProjectExecTransformer (50) + +- ^ FilterExecTransformer (49) + +- ^ Scan parquet (48) +- == Initial Plan == - HashAggregate (66) - +- HashAggregate (65) - +- HashAggregate (64) - +- Exchange (63) - +- HashAggregate (62) - +- Project (61) - +- Filter (60) - +- Scan parquet (59) + HashAggregate (72) + +- HashAggregate (71) + +- HashAggregate (70) + +- Exchange (69) + +- HashAggregate (68) + +- Project (67) + +- Filter (66) + +- Scan parquet (65) -(43) Scan parquet +(48) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(44) FilterExecTransformer +(49) FilterExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: ((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) -(45) ProjectExecTransformer +(50) ProjectExecTransformer Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(46) FlushableHashAggregateExecTransformer +(51) FlushableHashAggregateExecTransformer Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(47) ProjectExecTransformer +(52) ProjectExecTransformer Output [4]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(48) WholeStageCodegenTransformer (X) +(53) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: false -(49) ColumnarExchange +(54) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] +Arguments: X + +(55) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(50) ShuffleQueryStage +(56) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: X -(51) InputAdapter +(57) InputAdapter Input [3]: 
[l_suppkey#X, sum#X, isEmpty#X] -(52) InputIteratorTransformer +(58) InputIteratorTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] -(53) RegularHashAggregateExecTransformer +(59) RegularHashAggregateExecTransformer Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [l_suppkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(54) ProjectExecTransformer +(60) ProjectExecTransformer Output [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] Input [2]: [l_suppkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(55) RegularHashAggregateExecTransformer +(61) RegularHashAggregateExecTransformer Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(56) RegularHashAggregateExecTransformer +(62) RegularHashAggregateExecTransformer Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(57) WholeStageCodegenTransformer (X) +(63) WholeStageCodegenTransformer (X) Input [1]: [max(total_revenue)#X] Arguments: false -(58) VeloxColumnarToRowExec +(64) VeloxColumnarToRowExec Input [1]: [max(total_revenue)#X] -(59) Scan parquet +(65) Scan parquet Output [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1996-01-01), LessThan(l_shipdate,1996-04-01)] ReadSchema: struct -(60) Filter +(66) Filter Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1996-01-01)) AND (l_shipdate#X < 1996-04-01)) -(61) Project +(67) Project Output [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(62) HashAggregate +(68) HashAggregate Input [3]: [l_suppkey#X, l_extendedprice#X, l_discount#X] Keys [1]: [l_suppkey#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_suppkey#X, sum#X, isEmpty#X] -(63) Exchange +(69) Exchange Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(64) HashAggregate +(70) HashAggregate Input [3]: [l_suppkey#X, sum#X, isEmpty#X] Keys [1]: [l_suppkey#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS total_revenue#X] -(65) HashAggregate +(71) HashAggregate Input [1]: [total_revenue#X] Keys: [] Functions [1]: [partial_max(total_revenue#X)] Aggregate Attributes [1]: [max#X] Results [1]: [max#X] -(66) HashAggregate +(72) HashAggregate Input [1]: [max#X] Keys: [] Functions [1]: [max(total_revenue#X)] Aggregate Attributes [1]: [max(total_revenue#X)#X] Results [1]: [max(total_revenue#X)#X AS max(total_revenue)#X] -(67) AdaptiveSparkPlan +(73) AdaptiveSparkPlan Output [1]: [max(total_revenue)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/16.txt index 
498c1e83e15b5..45b6041f8b4b0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/16.txt @@ -1,64 +1,66 @@ == Physical Plan == -AdaptiveSparkPlan (64) +AdaptiveSparkPlan (71) +- == Final Plan == - VeloxColumnarToRowExec (42) - +- ^ SortExecTransformer (40) - +- ^ InputIteratorTransformer (39) - +- ^ InputAdapter (38) - +- ^ ShuffleQueryStage (37), Statistics(X) - +- ColumnarExchange (36) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ InputIteratorTransformer (33) - +- ^ InputAdapter (32) - +- ^ ShuffleQueryStage (31), Statistics(X) - +- ColumnarExchange (30) - +- ^ ProjectExecTransformer (28) - +- ^ FlushableHashAggregateExecTransformer (27) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (47) + +- ^ SortExecTransformer (45) + +- ^ InputIteratorTransformer (44) + +- ShuffleQueryStage (42), Statistics(X) + +- ColumnarExchange (41) + +- VeloxAppendBatches (40) + +- ^ RegularHashAggregateExecTransformer (38) + +- ^ InputIteratorTransformer (37) + +- ShuffleQueryStage (35), Statistics(X) + +- ColumnarExchange (34) + +- VeloxAppendBatches (33) + +- ^ ProjectExecTransformer (31) + +- ^ FlushableHashAggregateExecTransformer (30) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (63) - +- Exchange (62) - +- HashAggregate (61) - +- Exchange (60) - +- HashAggregate (59) - +- HashAggregate (58) - +- Exchange (57) - +- HashAggregate (56) - +- Project (55) - +- ShuffledHashJoin Inner BuildRight (54) - :- Exchange (50) - : +- BroadcastHashJoin LeftAnti BuildRight (49) - : :- Filter (44) - : : +- Scan parquet (43) - : +- BroadcastExchange (48) - : +- Project (47) - : +- Filter (46) - : +- Scan parquet (45) - +- Exchange (53) - +- Filter (52) - +- Scan parquet (51) + Sort (70) + +- Exchange (69) + +- HashAggregate (68) + +- Exchange (67) + +- HashAggregate (66) + +- HashAggregate (65) + +- Exchange (64) + +- HashAggregate (63) + +- Project (62) + +- SortMergeJoin Inner (61) + :- Sort (56) + : +- Exchange (55) + : +- 
BroadcastHashJoin LeftAnti BuildRight (54) + : :- Filter (49) + : : +- Scan parquet (48) + : +- BroadcastExchange (53) + : +- Project (52) + : +- Filter (51) + : +- Scan parquet (50) + +- Sort (60) + +- Exchange (59) + +- Filter (58) + +- Scan parquet (57) (1) Scan parquet @@ -80,273 +82,301 @@ Input [2]: [ps_partkey#X, ps_suppkey#X] Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [ps_partkey#X, ps_suppkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [ps_partkey#X, ps_suppkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [ps_partkey#X, ps_suppkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [5]: [hash(p_brand#X, 
p_type#X, p_size#X, ps_suppkey#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] +Arguments: X + +(25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(27) FlushableHashAggregateExecTransformer +(30) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(28) ProjectExecTransformer +(31) ProjectExecTransformer Output [5]: [hash(p_brand#X, p_type#X, p_size#X, 42) AS hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(29) WholeStageCodegenTransformer (X) +(32) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: false -(30) ColumnarExchange +(33) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] +Arguments: X + +(34) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] -(31) ShuffleQueryStage +(35) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: X -(32) InputAdapter +(36) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(33) InputIteratorTransformer +(37) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(34) RegularHashAggregateExecTransformer +(38) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] +Arguments: X + +(41) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: 
rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(40) SortExecTransformer +(45) SortExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(41) WholeStageCodegenTransformer (X) +(46) WholeStageCodegenTransformer (X) Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: false -(42) VeloxColumnarToRowExec +(47) VeloxColumnarToRowExec Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -(43) Scan parquet +(48) Scan parquet Output [2]: [ps_partkey#X, ps_suppkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_partkey)] ReadSchema: struct -(44) Filter +(49) Filter Input [2]: [ps_partkey#X, ps_suppkey#X] Condition : isnotnull(ps_partkey#X) -(45) Scan parquet +(50) Scan parquet Output [2]: [s_suppkey#X, s_comment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_comment)] ReadSchema: struct -(46) Filter +(51) Filter Input [2]: [s_suppkey#X, s_comment#X] Condition : (isnotnull(s_comment#X) AND s_comment#X LIKE %Customer%Complaints%) -(47) Project +(52) Project Output [1]: [s_suppkey#X] Input [2]: [s_suppkey#X, s_comment#X] -(48) BroadcastExchange +(53) BroadcastExchange Input [1]: [s_suppkey#X] Arguments: HashedRelationBroadcastMode(List(input[0, bigint, true]),true), [plan_id=X] -(49) BroadcastHashJoin +(54) BroadcastHashJoin Left keys [1]: [ps_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: LeftAnti Join condition: None -(50) Exchange +(55) Exchange Input [2]: [ps_partkey#X, ps_suppkey#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) Scan parquet +(56) Sort +Input [2]: [ps_partkey#X, ps_suppkey#X] +Arguments: [ps_partkey#X ASC NULLS FIRST], false, 0 + +(57) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_type), Not(EqualTo(p_brand,Brand#X)), Not(StringStartsWith(p_type,MEDIUM POLISHED)), In(p_size, [14,19,23,3,36,45,49,9]), IsNotNull(p_partkey)] ReadSchema: struct -(52) Filter +(58) Filter Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Condition : (((((isnotnull(p_brand#X) AND isnotnull(p_type#X)) AND NOT (p_brand#X = Brand#X)) AND NOT StartsWith(p_type#X, MEDIUM POLISHED)) AND p_size#X IN (49,14,23,45,19,3,36,9)) AND isnotnull(p_partkey#X)) -(53) Exchange +(59) Exchange Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) ShuffledHashJoin +(60) Sort +Input [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(61) SortMergeJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(55) Project +(62) Project Output [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Input [6]: [ps_partkey#X, ps_suppkey#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -(56) HashAggregate 
+(63) HashAggregate Input [4]: [ps_suppkey#X, p_brand#X, p_type#X, p_size#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(57) Exchange +(64) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(58) HashAggregate +(65) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Functions: [] Aggregate Attributes: [] Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -(59) HashAggregate +(66) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [partial_count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] -(60) Exchange +(67) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(61) HashAggregate +(68) HashAggregate Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] Functions [1]: [count(distinct ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] -(62) Exchange +(69) Exchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(63) Sort +(70) Sort Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: [supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST], true, 0 -(64) AdaptiveSparkPlan +(71) AdaptiveSparkPlan Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/17.txt index f15f42b69cd8b..b46b3e3f27242 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/17.txt @@ -1,59 +1,62 @@ == Physical Plan == -AdaptiveSparkPlan (57) +AdaptiveSparkPlan (63) +- == Final Plan == - VeloxColumnarToRowExec (37) - +- ^ ProjectExecTransformer (35) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ RegularHashAggregateExecTransformer (33) - +- ^ ProjectExecTransformer (32) - +- ^ ShuffledHashJoinExecTransformer Inner (31) - :- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (40) + +- ^ ProjectExecTransformer (38) + +- ^ RegularHashAggregateExecTransformer (37) + +- ^ RegularHashAggregateExecTransformer (36) + +- ^ ProjectExecTransformer (35) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (34) + :- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : 
+- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14), Statistics(X) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ FilterExecTransformer (10) - : +- ^ Scan parquet (9) - +- ^ FilterExecTransformer (30) - +- ^ ProjectExecTransformer (29) - +- ^ RegularHashAggregateExecTransformer (28) - +- ^ InputIteratorTransformer (27) - +- ^ InputAdapter (26) - +- ^ ShuffleQueryStage (25), Statistics(X) - +- ColumnarExchange (24) - +- ^ ProjectExecTransformer (22) - +- ^ FlushableHashAggregateExecTransformer (21) - +- ^ FilterExecTransformer (20) - +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FilterExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ FilterExecTransformer (33) + +- ^ ProjectExecTransformer (32) + +- ^ RegularHashAggregateExecTransformer (31) + +- ^ InputIteratorTransformer (30) + +- ShuffleQueryStage (28), Statistics(X) + +- ColumnarExchange (27) + +- VeloxAppendBatches (26) + +- ^ ProjectExecTransformer (24) + +- ^ FlushableHashAggregateExecTransformer (23) + +- ^ FilterExecTransformer (22) + +- ^ Scan parquet (21) +- == Initial Plan == - HashAggregate (56) - +- HashAggregate (55) - +- Project (54) - +- ShuffledHashJoin Inner BuildRight (53) - :- Project (46) - : +- ShuffledHashJoin Inner BuildRight (45) - : :- Exchange (40) - : : +- Filter (39) - : : +- Scan parquet (38) - : +- Exchange (44) - : +- Project (43) - : +- Filter (42) - : +- Scan parquet (41) - +- Filter (52) - +- HashAggregate (51) - +- Exchange (50) - +- HashAggregate (49) - +- Filter (48) - +- Scan parquet (47) + HashAggregate (62) + +- HashAggregate (61) + +- Project (60) + +- SortMergeJoin Inner (59) + :- Project (51) + : +- SortMergeJoin Inner (50) + : :- Sort (44) + : : +- Exchange (43) + : : +- Filter (42) + : : +- Scan parquet (41) + : +- Sort (49) + : +- Exchange (48) + : +- Project (47) + : +- Filter (46) + : +- Scan parquet (45) + +- Sort (58) + +- Filter (57) + +- HashAggregate (56) + +- Exchange (55) + +- HashAggregate (54) + +- Filter (53) + +- Scan parquet (52) (1) Scan parquet @@ -75,254 +78,278 @@ Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) 
FilterExecTransformer Input [3]: [p_partkey#X, p_brand#X, p_container#X] Arguments: ((((isnotnull(p_brand#X) AND isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [2]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(15) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [1]: [p_partkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [1]: [p_partkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(19) Scan parquet +(21) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(20) FilterExecTransformer +(22) FilterExecTransformer Input [2]: [l_partkey#X, l_quantity#X] Arguments: isnotnull(l_partkey#X) -(21) FlushableHashAggregateExecTransformer +(23) FlushableHashAggregateExecTransformer Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(22) ProjectExecTransformer +(24) ProjectExecTransformer Output [4]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_partkey#X, sum#X, count#X] Input [3]: [l_partkey#X, sum#X, count#X] -(23) WholeStageCodegenTransformer (X) +(25) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] Arguments: false -(24) ColumnarExchange +(26) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] +Arguments: X + +(27) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [id=#X] -(25) ShuffleQueryStage +(28) ShuffleQueryStage Output [3]: [l_partkey#X, sum#X, count#X] Arguments: X -(26) InputAdapter +(29) InputAdapter Input [3]: [l_partkey#X, sum#X, count#X] -(27) InputIteratorTransformer +(30) InputIteratorTransformer Input [3]: [l_partkey#X, sum#X, count#X] -(28) RegularHashAggregateExecTransformer +(31) RegularHashAggregateExecTransformer Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [l_partkey#X, avg(l_quantity#X)#X] -(29) ProjectExecTransformer +(32) ProjectExecTransformer Output [2]: [(0.2 * avg(l_quantity#X)#X) AS (0.2 * avg(l_quantity))#X, l_partkey#X] Input [2]: [l_partkey#X, avg(l_quantity#X)#X] -(30) FilterExecTransformer +(33) FilterExecTransformer Input [2]: [(0.2 * avg(l_quantity))#X, 
l_partkey#X] Arguments: isnotnull((0.2 * avg(l_quantity))#X) -(31) ShuffledHashJoinExecTransformer +(34) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(32) ProjectExecTransformer +(35) ProjectExecTransformer Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(33) RegularHashAggregateExecTransformer +(36) RegularHashAggregateExecTransformer Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(34) RegularHashAggregateExecTransformer +(37) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [sum(l_extendedprice#X)#X] -(35) ProjectExecTransformer +(38) ProjectExecTransformer Output [1]: [(sum(l_extendedprice#X)#X / 7.0) AS avg_yearly#X] Input [1]: [sum(l_extendedprice#X)#X] -(36) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [1]: [avg_yearly#X] Arguments: false -(37) VeloxColumnarToRowExec +(40) VeloxColumnarToRowExec Input [1]: [avg_yearly#X] -(38) Scan parquet +(41) Scan parquet Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_quantity)] ReadSchema: struct -(39) Filter +(42) Filter Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Condition : (isnotnull(l_partkey#X) AND isnotnull(l_quantity#X)) -(40) Exchange +(43) Exchange Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(41) Scan parquet +(44) Sort +Input [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(45) Scan parquet Output [3]: [p_partkey#X, p_brand#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_brand), IsNotNull(p_container), EqualTo(p_brand,Brand#X), EqualTo(p_container,MED BOX), IsNotNull(p_partkey)] ReadSchema: struct -(42) Filter +(46) Filter Input [3]: [p_partkey#X, p_brand#X, p_container#X] Condition : ((((isnotnull(p_brand#X) AND isnotnull(p_container#X)) AND (p_brand#X = Brand#X)) AND (p_container#X = MED BOX)) AND isnotnull(p_partkey#X)) -(43) Project +(47) Project Output [1]: [p_partkey#X] Input [3]: [p_partkey#X, p_brand#X, p_container#X] -(44) Exchange +(48) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(45) ShuffledHashJoin +(49) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(50) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: None -(46) Project +(51) Project Output [3]: [l_quantity#X, l_extendedprice#X, p_partkey#X] Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, p_partkey#X] -(47) Scan parquet +(52) Scan parquet Output [2]: [l_partkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey)] ReadSchema: struct -(48) Filter +(53) Filter Input [2]: [l_partkey#X, l_quantity#X] Condition : isnotnull(l_partkey#X) -(49) HashAggregate +(54) HashAggregate Input [2]: [l_partkey#X, l_quantity#X] Keys [1]: [l_partkey#X] 
Functions [1]: [partial_avg(l_quantity#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [3]: [l_partkey#X, sum#X, count#X] -(50) Exchange +(55) Exchange Input [3]: [l_partkey#X, sum#X, count#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(51) HashAggregate +(56) HashAggregate Input [3]: [l_partkey#X, sum#X, count#X] Keys [1]: [l_partkey#X] Functions [1]: [avg(l_quantity#X)] Aggregate Attributes [1]: [avg(l_quantity#X)#X] Results [2]: [(0.2 * avg(l_quantity#X)#X) AS (0.2 * avg(l_quantity))#X, l_partkey#X] -(52) Filter +(57) Filter Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] Condition : isnotnull((0.2 * avg(l_quantity))#X) -(53) ShuffledHashJoin +(58) Sort +Input [2]: [(0.2 * avg(l_quantity))#X, l_partkey#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(59) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: (cast(l_quantity#X as decimal(18,7)) < (0.2 * avg(l_quantity))#X) -(54) Project +(60) Project Output [1]: [l_extendedprice#X] Input [5]: [l_quantity#X, l_extendedprice#X, p_partkey#X, (0.2 * avg(l_quantity))#X, l_partkey#X] -(55) HashAggregate +(61) HashAggregate Input [1]: [l_extendedprice#X] Keys: [] Functions [1]: [partial_sum(l_extendedprice#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(56) HashAggregate +(62) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum(l_extendedprice#X)] Aggregate Attributes [1]: [sum(l_extendedprice#X)#X] Results [1]: [(sum(l_extendedprice#X)#X / 7.0) AS avg_yearly#X] -(57) AdaptiveSparkPlan +(63) AdaptiveSparkPlan Output [1]: [avg_yearly#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/18.txt index ad97ed4013f04..febb48962446b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/18.txt @@ -1,96 +1,102 @@ == Physical Plan == -AdaptiveSparkPlan (97) +AdaptiveSparkPlan (110) +- == Final Plan == - VeloxColumnarToRowExec (64) - +- TakeOrderedAndProjectExecTransformer (63) - +- ^ RegularHashAggregateExecTransformer (61) - +- ^ RegularHashAggregateExecTransformer (60) - +- ^ ProjectExecTransformer (59) - +- ^ ShuffledHashJoinExecTransformer Inner (58) - :- ^ InputIteratorTransformer (41) - : +- ^ InputAdapter (40) - : +- ^ ShuffleQueryStage (39), Statistics(X) - : +- ColumnarExchange (38) - : +- ^ ProjectExecTransformer (36) - : +- ^ ShuffledHashJoinExecTransformer Inner (35) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (70) + +- TakeOrderedAndProjectExecTransformer (69) + +- ^ RegularHashAggregateExecTransformer (67) + +- ^ RegularHashAggregateExecTransformer (66) + +- ^ ProjectExecTransformer (65) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (64) + :- ^ InputIteratorTransformer (46) + : +- ShuffleQueryStage (44), Statistics(X) + : +- ColumnarExchange (43) + : +- VeloxAppendBatches (42) + : +- ^ ProjectExecTransformer (40) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (39) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ 
Scan parquet (1) - : +- ^ InputIteratorTransformer (34) - : +- ^ InputAdapter (33) - : +- ^ ShuffleQueryStage (32), Statistics(X) - : +- ColumnarExchange (31) - : +- ^ ProjectExecTransformer (29) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (28) - : :- ^ InputIteratorTransformer (16) - : : +- ^ InputAdapter (15) - : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : +- ColumnarExchange (13) - : : +- ^ ProjectExecTransformer (11) - : : +- ^ FilterExecTransformer (10) - : : +- ^ Scan parquet (9) - : +- ^ ProjectExecTransformer (27) - : +- ^ FilterExecTransformer (26) - : +- ^ RegularHashAggregateExecTransformer (25) - : +- ^ InputIteratorTransformer (24) - : +- ^ InputAdapter (23) - : +- ^ ShuffleQueryStage (22), Statistics(X) - : +- ColumnarExchange (21) - : +- ^ ProjectExecTransformer (19) - : +- ^ FlushableHashAggregateExecTransformer (18) - : +- ^ Scan parquet (17) - +- ^ ShuffledHashJoinExecTransformer LeftSemi (57) - :- ^ InputIteratorTransformer (49) - : +- ^ InputAdapter (48) - : +- ^ ShuffleQueryStage (47), Statistics(X) - : +- ColumnarExchange (46) - : +- ^ ProjectExecTransformer (44) - : +- ^ FilterExecTransformer (43) - : +- ^ Scan parquet (42) - +- ^ ProjectExecTransformer (56) - +- ^ FilterExecTransformer (55) - +- ^ RegularHashAggregateExecTransformer (54) - +- ^ InputIteratorTransformer (53) - +- ^ InputAdapter (52) - +- ^ ShuffleQueryStage (51), Statistics(X) - +- ReusedExchange (50) + : +- ^ InputIteratorTransformer (38) + : +- ShuffleQueryStage (36), Statistics(X) + : +- ColumnarExchange (35) + : +- VeloxAppendBatches (34) + : +- ^ ProjectExecTransformer (32) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (31) + : :- ^ InputIteratorTransformer (18) + : : +- ShuffleQueryStage (16), Statistics(X) + : : +- ColumnarExchange (15) + : : +- VeloxAppendBatches (14) + : : +- ^ ProjectExecTransformer (12) + : : +- ^ FilterExecTransformer (11) + : : +- ^ Scan parquet (10) + : +- ^ ProjectExecTransformer (30) + : +- ^ FilterExecTransformer (29) + : +- ^ RegularHashAggregateExecTransformer (28) + : +- ^ InputIteratorTransformer (27) + : +- ShuffleQueryStage (25), Statistics(X) + : +- ColumnarExchange (24) + : +- VeloxAppendBatches (23) + : +- ^ ProjectExecTransformer (21) + : +- ^ FlushableHashAggregateExecTransformer (20) + : +- ^ Scan parquet (19) + +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (63) + :- ^ InputIteratorTransformer (55) + : +- ShuffleQueryStage (53), Statistics(X) + : +- ColumnarExchange (52) + : +- VeloxAppendBatches (51) + : +- ^ ProjectExecTransformer (49) + : +- ^ FilterExecTransformer (48) + : +- ^ Scan parquet (47) + +- ^ ProjectExecTransformer (62) + +- ^ FilterExecTransformer (61) + +- ^ RegularHashAggregateExecTransformer (60) + +- ^ InputIteratorTransformer (59) + +- ShuffleQueryStage (57), Statistics(X) + +- ReusedExchange (56) +- == Initial Plan == - TakeOrderedAndProject (96) - +- HashAggregate (95) - +- HashAggregate (94) - +- Project (93) - +- ShuffledHashJoin Inner BuildRight (92) - :- Exchange (81) - : +- Project (80) - : +- ShuffledHashJoin Inner BuildLeft (79) - : :- Exchange (67) - : : +- Filter (66) - : : +- Scan parquet (65) - : +- Exchange (78) - : +- ShuffledHashJoin LeftSemi BuildRight (77) - : :- Exchange (70) - : : +- Filter (69) - : : +- Scan parquet (68) - : +- Project (76) - : +- Filter (75) - : +- HashAggregate (74) - : +- Exchange (73) - : +- HashAggregate (72) - : +- Scan parquet (71) - +- ShuffledHashJoin LeftSemi BuildRight (91) - :- Exchange (84) - : +- Filter (83) - : +- Scan parquet (82) - +- Project 
(90) - +- Filter (89) - +- HashAggregate (88) - +- Exchange (87) - +- HashAggregate (86) - +- Scan parquet (85) + TakeOrderedAndProject (109) + +- HashAggregate (108) + +- HashAggregate (107) + +- Project (106) + +- SortMergeJoin Inner (105) + :- Sort (92) + : +- Exchange (91) + : +- Project (90) + : +- SortMergeJoin Inner (89) + : :- Sort (74) + : : +- Exchange (73) + : : +- Filter (72) + : : +- Scan parquet (71) + : +- Sort (88) + : +- Exchange (87) + : +- SortMergeJoin LeftSemi (86) + : :- Sort (78) + : : +- Exchange (77) + : : +- Filter (76) + : : +- Scan parquet (75) + : +- Sort (85) + : +- Project (84) + : +- Filter (83) + : +- HashAggregate (82) + : +- Exchange (81) + : +- HashAggregate (80) + : +- Scan parquet (79) + +- SortMergeJoin LeftSemi (104) + :- Sort (96) + : +- Exchange (95) + : +- Filter (94) + : +- Scan parquet (93) + +- Sort (103) + +- Project (102) + +- Filter (101) + +- HashAggregate (100) + +- Exchange (99) + +- HashAggregate (98) + +- Scan parquet (97) (1) Scan parquet @@ -112,428 +118,480 @@ Input [2]: [c_custkey#X, c_name#X] Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [c_custkey#X, c_name#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [c_custkey#X, c_name#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [c_custkey#X, c_name#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(18) FlushableHashAggregateExecTransformer 
+(20) FlushableHashAggregateExecTransformer Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] +Arguments: X + +(24) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(25) RegularHashAggregateExecTransformer +(28) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(26) FilterExecTransformer +(29) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(27) ProjectExecTransformer +(30) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(28) ShuffledHashJoinExecTransformer +(31) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(29) ProjectExecTransformer +(32) ProjectExecTransformer Output [5]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(30) WholeStageCodegenTransformer (X) +(33) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(31) ColumnarExchange +(34) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(35) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(32) ShuffleQueryStage +(36) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(33) InputAdapter +(37) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(34) InputIteratorTransformer +(38) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(35) ShuffledHashJoinExecTransformer +(39) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(36) ProjectExecTransformer +(40) ProjectExecTransformer Output [6]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, 
c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(37) WholeStageCodegenTransformer (X) +(41) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: false -(38) ColumnarExchange +(42) VeloxAppendBatches +Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] +Arguments: X + +(43) ColumnarExchange Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] -(39) ShuffleQueryStage +(44) ShuffleQueryStage Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: X -(40) InputAdapter +(45) InputAdapter Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -(41) InputIteratorTransformer +(46) InputIteratorTransformer Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -(42) Scan parquet +(47) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(43) FilterExecTransformer +(48) FilterExecTransformer Input [2]: [l_orderkey#X, l_quantity#X] Arguments: isnotnull(l_orderkey#X) -(44) ProjectExecTransformer +(49) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_quantity#X] Input [2]: [l_orderkey#X, l_quantity#X] -(45) WholeStageCodegenTransformer (X) +(50) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] Arguments: false -(46) ColumnarExchange +(51) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] +Arguments: X + +(52) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [id=#X] -(47) ShuffleQueryStage +(53) ShuffleQueryStage Output [2]: [l_orderkey#X, l_quantity#X] Arguments: X -(48) InputAdapter +(54) InputAdapter Input [2]: [l_orderkey#X, l_quantity#X] -(49) InputIteratorTransformer +(55) InputIteratorTransformer Input [2]: [l_orderkey#X, l_quantity#X] -(50) ReusedExchange [Reuses operator id: 21] +(56) ReusedExchange [Reuses operator id: 24] Output [3]: [l_orderkey#X, sum#X, isEmpty#X] -(51) ShuffleQueryStage +(57) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: X -(52) InputAdapter +(58) InputAdapter Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(53) InputIteratorTransformer +(59) InputIteratorTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] -(54) RegularHashAggregateExecTransformer +(60) RegularHashAggregateExecTransformer Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(55) FilterExecTransformer +(61) FilterExecTransformer Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Arguments: (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(56) ProjectExecTransformer +(62) ProjectExecTransformer Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(57) 
ShuffledHashJoinExecTransformer +(63) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(58) ShuffledHashJoinExecTransformer +(64) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(59) ProjectExecTransformer +(65) ProjectExecTransformer Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(60) RegularHashAggregateExecTransformer +(66) RegularHashAggregateExecTransformer Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(61) RegularHashAggregateExecTransformer +(67) RegularHashAggregateExecTransformer Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(62) WholeStageCodegenTransformer (X) +(68) WholeStageCodegenTransformer (X) Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: false -(63) TakeOrderedAndProjectExecTransformer +(69) TakeOrderedAndProjectExecTransformer Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X], 0 -(64) VeloxColumnarToRowExec +(70) VeloxColumnarToRowExec Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(65) Scan parquet +(71) Scan parquet Output [2]: [c_custkey#X, c_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey)] ReadSchema: struct -(66) Filter +(72) Filter Input [2]: [c_custkey#X, c_name#X] Condition : isnotnull(c_custkey#X) -(67) Exchange +(73) Exchange Input [2]: [c_custkey#X, c_name#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(68) Scan parquet +(74) Sort +Input [2]: [c_custkey#X, c_name#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(75) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(69) Filter +(76) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Condition : (isnotnull(o_custkey#X) AND isnotnull(o_orderkey#X)) -(70) Exchange +(77) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(71) Scan parquet +(78) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(79) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true 
Location: InMemoryFileIndex [*] ReadSchema: struct -(72) HashAggregate +(80) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(73) Exchange +(81) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(74) HashAggregate +(82) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(75) Filter +(83) Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(76) Project +(84) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(77) ShuffledHashJoin +(85) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(86) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(78) Exchange +(87) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(79) ShuffledHashJoin +(88) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(89) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(80) Project +(90) Project Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -(81) Exchange +(91) Exchange Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(82) Scan parquet +(92) Sort +Input [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(93) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey)] ReadSchema: struct -(83) Filter +(94) Filter Input [2]: [l_orderkey#X, l_quantity#X] Condition : isnotnull(l_orderkey#X) -(84) Exchange +(95) Exchange Input [2]: [l_orderkey#X, l_quantity#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) Scan parquet +(96) Sort +Input [2]: [l_orderkey#X, l_quantity#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(97) Scan parquet Output [2]: [l_orderkey#X, l_quantity#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(86) HashAggregate +(98) HashAggregate Input [2]: [l_orderkey#X, l_quantity#X] Keys [1]: [l_orderkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [l_orderkey#X, sum#X, isEmpty#X] -(87) Exchange +(99) Exchange Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(88) HashAggregate +(100) HashAggregate Input [3]: [l_orderkey#X, sum#X, isEmpty#X] Keys [1]: [l_orderkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [2]: [l_orderkey#X, sum(l_quantity#X)#X AS sum(l_quantity#X)#X] -(89) Filter +(101) 
Filter Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] Condition : (isnotnull(sum(l_quantity#X)#X) AND (sum(l_quantity#X)#X > 300.00)) -(90) Project +(102) Project Output [1]: [l_orderkey#X] Input [2]: [l_orderkey#X, sum(l_quantity#X)#X] -(91) ShuffledHashJoin +(103) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(104) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(92) ShuffledHashJoin +(105) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(93) Project +(106) Project Output [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Input [7]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_orderkey#X, l_quantity#X] -(94) HashAggregate +(107) HashAggregate Input [6]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X, l_quantity#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -(95) HashAggregate +(108) HashAggregate Input [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] Keys [5]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity#X)#X AS sum(l_quantity)#X] -(96) TakeOrderedAndProject +(109) TakeOrderedAndProject Input [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: X, [o_totalprice#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] -(97) AdaptiveSparkPlan +(110) AdaptiveSparkPlan Output [6]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum(l_quantity)#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/19.txt index 4b5c20d3a1b19..fa78645313e48 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/19.txt @@ -1,37 +1,39 @@ == Physical Plan == -AdaptiveSparkPlan (34) +AdaptiveSparkPlan (38) +- == Final Plan == - VeloxColumnarToRowExec (22) - +- ^ RegularHashAggregateExecTransformer (20) - +- ^ RegularHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer Inner (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (24) + +- ^ RegularHashAggregateExecTransformer (22) + +- ^ RegularHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage 
(14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - HashAggregate (33) - +- HashAggregate (32) - +- Project (31) - +- ShuffledHashJoin Inner BuildRight (30) - :- Exchange (26) - : +- Project (25) - : +- Filter (24) - : +- Scan parquet (23) - +- Exchange (29) - +- Filter (28) - +- Scan parquet (27) + HashAggregate (37) + +- HashAggregate (36) + +- Project (35) + +- SortMergeJoin Inner (34) + :- Sort (29) + : +- Exchange (28) + : +- Project (27) + : +- Filter (26) + : +- Scan parquet (25) + +- Sort (33) + +- Exchange (32) + +- Filter (31) + +- Scan parquet (30) (1) Scan parquet @@ -53,142 +55,158 @@ Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipin Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, p_partkey#X, 
p_brand#X, p_size#X, p_container#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [l_extendedprice#X, l_discount#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(19) RegularHashAggregateExecTransformer +(21) RegularHashAggregateExecTransformer Input [3]: [l_extendedprice#X, l_discount#X, _pre_X#X] Keys: [] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(20) RegularHashAggregateExecTransformer +(22) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(22) VeloxColumnarToRowExec +(24) VeloxColumnarToRowExec Input [1]: [revenue#X] -(23) Scan parquet +(25) Scan parquet Output [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipinstruct), In(l_shipmode, [AIR,AIR REG]), EqualTo(l_shipinstruct,DELIVER IN PERSON), IsNotNull(l_partkey), Or(Or(And(GreaterThanOrEqual(l_quantity,1.00),LessThanOrEqual(l_quantity,11.00)),And(GreaterThanOrEqual(l_quantity,10.00),LessThanOrEqual(l_quantity,20.00))),And(GreaterThanOrEqual(l_quantity,20.00),LessThanOrEqual(l_quantity,30.00)))] ReadSchema: struct -(24) Filter +(26) Filter Input [6]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] Condition : ((((isnotnull(l_shipinstruct#X) AND l_shipmode#X IN (AIR,AIR REG)) AND (l_shipinstruct#X = DELIVER IN PERSON)) AND isnotnull(l_partkey#X)) AND ((((l_quantity#X >= 1.00) AND (l_quantity#X <= 11.00)) OR ((l_quantity#X >= 10.00) AND (l_quantity#X <= 20.00))) OR ((l_quantity#X >= 20.00) AND (l_quantity#X <= 30.00)))) -(25) Project +(27) Project Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_partkey#X, 
l_quantity#X, l_extendedprice#X, l_discount#X, l_shipinstruct#X, l_shipmode#X] -(26) Exchange +(28) Exchange Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(27) Scan parquet +(29) Sort +Input [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(30) Scan parquet Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_size), GreaterThanOrEqual(p_size,1), IsNotNull(p_partkey), Or(Or(And(And(EqualTo(p_brand,Brand#X),In(p_container, [SM BOX,SM CASE,SM PACK,SM PKG])),LessThanOrEqual(p_size,5)),And(And(EqualTo(p_brand,Brand#X),In(p_container, [MED BAG,MED BOX,MED PACK,MED PKG])),LessThanOrEqual(p_size,10))),And(And(EqualTo(p_brand,Brand#X),In(p_container, [LG BOX,LG CASE,LG PACK,LG PKG])),LessThanOrEqual(p_size,15)))] ReadSchema: struct -(28) Filter +(31) Filter Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Condition : (((isnotnull(p_size#X) AND (p_size#X >= 1)) AND isnotnull(p_partkey#X)) AND (((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (p_size#X <= 5)) OR (((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (p_size#X <= 10))) OR (((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (p_size#X <= 15)))) -(29) Exchange +(32) Exchange Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(30) ShuffledHashJoin +(33) Sort +Input [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(34) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: Inner Join condition: (((((((p_brand#X = Brand#X) AND p_container#X IN (SM CASE,SM BOX,SM PACK,SM PKG)) AND (l_quantity#X >= 1.00)) AND (l_quantity#X <= 11.00)) AND (p_size#X <= 5)) OR (((((p_brand#X = Brand#X) AND p_container#X IN (MED BAG,MED BOX,MED PKG,MED PACK)) AND (l_quantity#X >= 10.00)) AND (l_quantity#X <= 20.00)) AND (p_size#X <= 10))) OR (((((p_brand#X = Brand#X) AND p_container#X IN (LG CASE,LG BOX,LG PACK,LG PKG)) AND (l_quantity#X >= 20.00)) AND (l_quantity#X <= 30.00)) AND (p_size#X <= 15))) -(31) Project +(35) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [8]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -(32) HashAggregate +(36) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(33) HashAggregate +(37) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(34) AdaptiveSparkPlan +(38) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/20.txt index 17f48f8ffa72a..bb9987fc32c1b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/20.txt +++ 
b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/20.txt @@ -1,119 +1,129 @@ == Physical Plan == -AdaptiveSparkPlan (123) +AdaptiveSparkPlan (143) +- == Final Plan == - VeloxColumnarToRowExec (83) - +- AQEShuffleRead (82) - +- ShuffleQueryStage (81), Statistics(X) - +- ColumnarExchange (80) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (62) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) - : : +- ^ ProjectExecTransformer (3) - : : +- ^ FilterExecTransformer (2) - : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ ShuffledHashJoinExecTransformer Inner (55) - : :- ^ InputIteratorTransformer (31) - : : +- ^ InputAdapter (30) - : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : +- ColumnarExchange (28) - : : +- ^ ProjectExecTransformer (26) - : : +- ^ ShuffledHashJoinExecTransformer LeftSemi (25) - : : :- ^ InputIteratorTransformer (16) - : : : +- ^ InputAdapter (15) - : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : +- ColumnarExchange (13) - : : : +- ^ ProjectExecTransformer (11) - : : : +- ^ FilterExecTransformer (10) - : : : +- ^ Scan parquet (9) - : : +- ^ InputIteratorTransformer (24) - : : +- ^ InputAdapter (23) - : : +- ^ ShuffleQueryStage (22), Statistics(X) - : : +- ColumnarExchange (21) - : : +- ^ ProjectExecTransformer (19) - : : +- ^ FilterExecTransformer (18) - : : +- ^ Scan parquet (17) - : +- ^ InputIteratorTransformer (54) - : +- ^ InputAdapter (53) - : +- ^ ShuffleQueryStage (52), Statistics(X) - : +- ColumnarExchange (51) - : +- ^ ProjectExecTransformer (49) - : +- ^ FilterExecTransformer (48) - : +- ^ ProjectExecTransformer (47) - : +- ^ RegularHashAggregateExecTransformer (46) - : +- ^ RegularHashAggregateExecTransformer (45) - : +- ^ ShuffledHashJoinExecTransformer LeftSemi (44) - : :- ^ InputIteratorTransformer (39) - : : +- ^ InputAdapter (38) - : : +- ^ ShuffleQueryStage (37), Statistics(X) - : : +- ColumnarExchange (36) - : : +- ^ ProjectExecTransformer (34) - : : +- ^ FilterExecTransformer (33) - : : +- ^ Scan parquet (32) - : +- ^ InputIteratorTransformer (43) - : +- ^ InputAdapter (42) - : +- ^ ShuffleQueryStage (41), Statistics(X) - : +- ReusedExchange (40) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74), Statistics(X) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ FilterExecTransformer (70) - +- ^ Scan parquet (69) + VeloxColumnarToRowExec (93) + +- AQEShuffleRead (92) + +- ShuffleQueryStage (91), Statistics(X) + +- ColumnarExchange (90) + +- VeloxAppendBatches (89) + +- ^ ProjectExecTransformer (87) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (86) + :- ^ InputIteratorTransformer (76) + : +- ShuffleQueryStage (74), Statistics(X) + : +- ColumnarExchange (73) + : +- VeloxAppendBatches (72) + : +- ^ ProjectExecTransformer (70) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (69) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) + : : +- ^ 
ProjectExecTransformer (3) + : : +- ^ FilterExecTransformer (2) + : : +- ^ Scan parquet (1) + : +- ^ InputIteratorTransformer (68) + : +- ShuffleQueryStage (66), Statistics(X) + : +- ColumnarExchange (65) + : +- VeloxAppendBatches (64) + : +- ^ ProjectExecTransformer (62) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (61) + : :- ^ InputIteratorTransformer (35) + : : +- ShuffleQueryStage (33), Statistics(X) + : : +- ColumnarExchange (32) + : : +- VeloxAppendBatches (31) + : : +- ^ ProjectExecTransformer (29) + : : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (28) + : : :- ^ InputIteratorTransformer (18) + : : : +- ShuffleQueryStage (16), Statistics(X) + : : : +- ColumnarExchange (15) + : : : +- VeloxAppendBatches (14) + : : : +- ^ ProjectExecTransformer (12) + : : : +- ^ FilterExecTransformer (11) + : : : +- ^ Scan parquet (10) + : : +- ^ InputIteratorTransformer (27) + : : +- ShuffleQueryStage (25), Statistics(X) + : : +- ColumnarExchange (24) + : : +- VeloxAppendBatches (23) + : : +- ^ ProjectExecTransformer (21) + : : +- ^ FilterExecTransformer (20) + : : +- ^ Scan parquet (19) + : +- ^ InputIteratorTransformer (60) + : +- ShuffleQueryStage (58), Statistics(X) + : +- ColumnarExchange (57) + : +- VeloxAppendBatches (56) + : +- ^ ProjectExecTransformer (54) + : +- ^ FilterExecTransformer (53) + : +- ^ ProjectExecTransformer (52) + : +- ^ RegularHashAggregateExecTransformer (51) + : +- ^ RegularHashAggregateExecTransformer (50) + : +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (49) + : :- ^ InputIteratorTransformer (44) + : : +- ShuffleQueryStage (42), Statistics(X) + : : +- ColumnarExchange (41) + : : +- VeloxAppendBatches (40) + : : +- ^ ProjectExecTransformer (38) + : : +- ^ FilterExecTransformer (37) + : : +- ^ Scan parquet (36) + : +- ^ InputIteratorTransformer (48) + : +- ShuffleQueryStage (46), Statistics(X) + : +- ReusedExchange (45) + +- ^ InputIteratorTransformer (85) + +- ShuffleQueryStage (83), Statistics(X) + +- ColumnarExchange (82) + +- VeloxAppendBatches (81) + +- ^ ProjectExecTransformer (79) + +- ^ FilterExecTransformer (78) + +- ^ Scan parquet (77) +- == Initial Plan == - Sort (122) - +- Exchange (121) - +- Project (120) - +- ShuffledHashJoin Inner BuildRight (119) - :- Exchange (114) - : +- Project (113) - : +- ShuffledHashJoin LeftSemi BuildRight (112) - : :- Exchange (86) - : : +- Filter (85) - : : +- Scan parquet (84) - : +- Exchange (111) - : +- Project (110) - : +- ShuffledHashJoin Inner BuildLeft (109) - : :- Exchange (95) - : : +- ShuffledHashJoin LeftSemi BuildRight (94) - : : :- Exchange (89) - : : : +- Filter (88) - : : : +- Scan parquet (87) - : : +- Exchange (93) - : : +- Project (92) - : : +- Filter (91) - : : +- Scan parquet (90) - : +- Exchange (108) - : +- Filter (107) - : +- HashAggregate (106) - : +- HashAggregate (105) - : +- ShuffledHashJoin LeftSemi BuildRight (104) - : :- Exchange (99) - : : +- Project (98) - : : +- Filter (97) - : : +- Scan parquet (96) - : +- Exchange (103) - : +- Project (102) - : +- Filter (101) - : +- Scan parquet (100) - +- Exchange (118) - +- Project (117) - +- Filter (116) - +- Scan parquet (115) + Sort (142) + +- Exchange (141) + +- Project (140) + +- SortMergeJoin Inner (139) + :- Sort (133) + : +- Exchange (132) + : +- Project (131) + : +- SortMergeJoin LeftSemi (130) + : :- Sort (97) + : : +- Exchange (96) + : : +- Filter (95) + : : +- Scan parquet (94) + : +- Sort (129) + : +- Exchange (128) + : +- Project (127) + : +- SortMergeJoin Inner (126) + : :- Sort (109) + : : +- Exchange (108) 
+ : : +- SortMergeJoin LeftSemi (107) + : : :- Sort (101) + : : : +- Exchange (100) + : : : +- Filter (99) + : : : +- Scan parquet (98) + : : +- Sort (106) + : : +- Exchange (105) + : : +- Project (104) + : : +- Filter (103) + : : +- Scan parquet (102) + : +- Sort (125) + : +- Exchange (124) + : +- Filter (123) + : +- HashAggregate (122) + : +- HashAggregate (121) + : +- SortMergeJoin LeftSemi (120) + : :- Sort (114) + : : +- Exchange (113) + : : +- Project (112) + : : +- Filter (111) + : : +- Scan parquet (110) + : +- Sort (119) + : +- Exchange (118) + : +- Project (117) + : +- Filter (116) + : +- Scan parquet (115) + +- Sort (138) + +- Exchange (137) + +- Project (136) + +- Filter (135) + +- Scan parquet (134) (1) Scan parquet @@ -135,518 +145,598 @@ Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: X + +(15) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(18) FilterExecTransformer +(20) FilterExecTransformer Input [2]: [p_partkey#X, p_name#X] Arguments: (isnotnull(p_name#X) AND 
StartsWith(p_name#X, forest)) -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [2]: [hash(p_partkey#X, 42) AS hash_partition_key#X, p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(24) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [1]: [p_partkey#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [1]: [p_partkey#X] -(25) ShuffledHashJoinExecTransformer +(28) ShuffledHashJoinExecTransformer Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join type: LeftSemi Join condition: None -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(ps_partkey#X, ps_suppkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] -(32) Scan parquet +(36) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(33) FilterExecTransformer +(37) FilterExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Arguments: ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(34) ProjectExecTransformer +(38) ProjectExecTransformer Output [4]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] +Arguments: X + +(41) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [id=#X] -(37) ShuffleQueryStage 
+(42) ShuffleQueryStage Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] -(40) ReusedExchange [Reuses operator id: 21] +(45) ReusedExchange [Reuses operator id: 24] Output [1]: [p_partkey#X] -(41) ShuffleQueryStage +(46) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(42) InputAdapter +(47) InputAdapter Input [1]: [p_partkey#X] -(43) InputIteratorTransformer +(48) InputIteratorTransformer Input [1]: [p_partkey#X] -(44) ShuffledHashJoinExecTransformer +(49) ShuffledHashJoinExecTransformer Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: LeftSemi Join condition: None -(45) RegularHashAggregateExecTransformer +(50) RegularHashAggregateExecTransformer Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(46) RegularHashAggregateExecTransformer +(51) RegularHashAggregateExecTransformer Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(47) ProjectExecTransformer +(52) ProjectExecTransformer Output [3]: [(0.5 * sum(l_quantity#X)#X) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [l_partkey#X, l_suppkey#X, sum(l_quantity#X)#X] -(48) FilterExecTransformer +(53) FilterExecTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: isnotnull((0.5 * sum(l_quantity))#X) -(49) ProjectExecTransformer +(54) ProjectExecTransformer Output [4]: [hash(l_partkey#X, l_suppkey#X, 42) AS hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(50) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: false -(51) ColumnarExchange +(56) VeloxAppendBatches +Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] +Arguments: X + +(57) ColumnarExchange Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(52) ShuffleQueryStage +(58) ShuffleQueryStage Output [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: X -(53) InputAdapter +(59) InputAdapter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(54) InputIteratorTransformer +(60) InputIteratorTransformer Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(55) ShuffledHashJoinExecTransformer +(61) ShuffledHashJoinExecTransformer Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join type: Inner Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(56) ProjectExecTransformer +(62) ProjectExecTransformer Output [2]: [hash(ps_suppkey#X, 42) AS hash_partition_key#X, ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, 
l_suppkey#X] -(57) WholeStageCodegenTransformer (X) +(63) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, ps_suppkey#X] Arguments: false -(58) ColumnarExchange +(64) VeloxAppendBatches +Input [2]: [hash_partition_key#X, ps_suppkey#X] +Arguments: X + +(65) ColumnarExchange Input [2]: [hash_partition_key#X, ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(66) ShuffleQueryStage Output [1]: [ps_suppkey#X] Arguments: X -(60) InputAdapter +(67) InputAdapter Input [1]: [ps_suppkey#X] -(61) InputIteratorTransformer +(68) InputIteratorTransformer Input [1]: [ps_suppkey#X] -(62) ShuffledHashJoinExecTransformer +(69) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join type: LeftSemi Join condition: None -(63) ProjectExecTransformer +(70) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(64) WholeStageCodegenTransformer (X) +(71) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Arguments: false -(65) ColumnarExchange +(72) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: X + +(73) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(74) ShuffleQueryStage Output [3]: [s_name#X, s_address#X, s_nationkey#X] Arguments: X -(67) InputAdapter +(75) InputAdapter Input [3]: [s_name#X, s_address#X, s_nationkey#X] -(68) InputIteratorTransformer +(76) InputIteratorTransformer Input [3]: [s_name#X, s_address#X, s_nationkey#X] -(69) Scan parquet +(77) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(70) FilterExecTransformer +(78) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(71) ProjectExecTransformer +(79) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(72) WholeStageCodegenTransformer (X) +(80) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(73) ColumnarExchange +(81) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(82) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(83) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(75) InputAdapter +(84) InputAdapter Input [1]: [n_nationkey#X] -(76) InputIteratorTransformer +(85) InputIteratorTransformer Input [1]: [n_nationkey#X] -(77) ShuffledHashJoinExecTransformer +(86) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(78) ProjectExecTransformer +(87) ProjectExecTransformer Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(79) WholeStageCodegenTransformer (X) 
+(88) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, s_address#X] Arguments: false -(80) ColumnarExchange +(89) VeloxAppendBatches +Input [2]: [s_name#X, s_address#X] +Arguments: X + +(90) ColumnarExchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(81) ShuffleQueryStage +(91) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] Arguments: X -(82) AQEShuffleRead +(92) AQEShuffleRead Input [2]: [s_name#X, s_address#X] Arguments: local -(83) VeloxColumnarToRowExec +(93) VeloxColumnarToRowExec Input [2]: [s_name#X, s_address#X] -(84) Scan parquet +(94) Scan parquet Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_nationkey)] ReadSchema: struct -(85) Filter +(95) Filter Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Condition : isnotnull(s_nationkey#X) -(86) Exchange +(96) Exchange Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(87) Scan parquet +(97) Sort +Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(98) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_availqty), IsNotNull(ps_partkey), IsNotNull(ps_suppkey)] ReadSchema: struct -(88) Filter +(99) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Condition : ((isnotnull(ps_availqty#X) AND isnotnull(ps_partkey#X)) AND isnotnull(ps_suppkey#X)) -(89) Exchange +(100) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(90) Scan parquet +(101) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: [ps_partkey#X ASC NULLS FIRST], false, 0 + +(102) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(91) Filter +(103) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(92) Project +(104) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(93) Exchange +(105) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(94) ShuffledHashJoin +(106) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(107) SortMergeJoin Left keys [1]: [ps_partkey#X] Right keys [1]: [p_partkey#X] Join type: LeftSemi Join condition: None -(95) Exchange +(108) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(96) Scan parquet +(109) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] +Arguments: [ps_partkey#X ASC NULLS FIRST, ps_suppkey#X ASC NULLS FIRST], false, 0 + +(110) Scan parquet Output [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), IsNotNull(l_partkey), IsNotNull(l_suppkey)] ReadSchema: struct -(97) Filter +(111) Filter Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] Condition : 
((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND isnotnull(l_partkey#X)) AND isnotnull(l_suppkey#X)) -(98) Project +(112) Project Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Input [4]: [l_partkey#X, l_suppkey#X, l_quantity#X, l_shipdate#X] -(99) Exchange +(113) Exchange Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(100) Scan parquet +(114) Sort +Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(115) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringStartsWith(p_name,forest)] ReadSchema: struct -(101) Filter +(116) Filter Input [2]: [p_partkey#X, p_name#X] Condition : (isnotnull(p_name#X) AND StartsWith(p_name#X, forest)) -(102) Project +(117) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(103) Exchange +(118) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(104) ShuffledHashJoin +(119) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(120) SortMergeJoin Left keys [1]: [l_partkey#X] Right keys [1]: [p_partkey#X] Join type: LeftSemi Join condition: None -(105) HashAggregate +(121) HashAggregate Input [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [partial_sum(l_quantity#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -(106) HashAggregate +(122) HashAggregate Input [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] Keys [2]: [l_partkey#X, l_suppkey#X] Functions [1]: [sum(l_quantity#X)] Aggregate Attributes [1]: [sum(l_quantity#X)#X] Results [3]: [(0.5 * sum(l_quantity#X)#X) AS (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(107) Filter +(123) Filter Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Condition : isnotnull((0.5 * sum(l_quantity))#X) -(108) Exchange +(124) Exchange Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(125) Sort +Input [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] +Arguments: [l_partkey#X ASC NULLS FIRST, l_suppkey#X ASC NULLS FIRST], false, 0 + +(126) SortMergeJoin Left keys [2]: [ps_partkey#X, ps_suppkey#X] Right keys [2]: [l_partkey#X, l_suppkey#X] Join type: Inner Join condition: (cast(ps_availqty#X as decimal(24,3)) > (0.5 * sum(l_quantity))#X) -(110) Project +(127) Project Output [1]: [ps_suppkey#X] Input [6]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -(111) Exchange +(128) Exchange Input [1]: [ps_suppkey#X] Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) ShuffledHashJoin +(129) Sort +Input [1]: [ps_suppkey#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST], false, 0 + +(130) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [ps_suppkey#X] Join type: LeftSemi Join condition: None -(113) Project +(131) Project Output [3]: [s_name#X, s_address#X, s_nationkey#X] Input [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -(114) Exchange +(132) Exchange Input [3]: [s_name#X, s_address#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(115) 
Scan parquet +(133) Sort +Input [3]: [s_name#X, s_address#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(134) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,CANADA), IsNotNull(n_nationkey)] ReadSchema: struct -(116) Filter +(135) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = CANADA)) AND isnotnull(n_nationkey#X)) -(117) Project +(136) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(118) Exchange +(137) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(119) ShuffledHashJoin +(138) Sort +Input [1]: [n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(139) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(120) Project +(140) Project Output [2]: [s_name#X, s_address#X] Input [4]: [s_name#X, s_address#X, s_nationkey#X, n_nationkey#X] -(121) Exchange +(141) Exchange Input [2]: [s_name#X, s_address#X] Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(122) Sort +(142) Sort Input [2]: [s_name#X, s_address#X] Arguments: [s_name#X ASC NULLS FIRST], true, 0 -(123) AdaptiveSparkPlan +(143) AdaptiveSparkPlan Output [2]: [s_name#X, s_address#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/21.txt index 7b5d71ec6d265..5e8c9ad9f92ac 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/21.txt @@ -1,113 +1,122 @@ == Physical Plan == -AdaptiveSparkPlan (118) +AdaptiveSparkPlan (137) +- == Final Plan == - VeloxColumnarToRowExec (81) - +- ^ RegularHashAggregateExecTransformer (79) - +- ^ InputIteratorTransformer (78) - +- ^ InputAdapter (77) - +- ^ ShuffleQueryStage (76), Statistics(X) - +- ColumnarExchange (75) - +- ^ ProjectExecTransformer (73) - +- ^ FlushableHashAggregateExecTransformer (72) - +- ^ ProjectExecTransformer (71) - +- ^ ShuffledHashJoinExecTransformer Inner (70) - :- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ ShuffledHashJoinExecTransformer Inner (55) - : :- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ ShuffledHashJoinExecTransformer Inner (40) - : : :- ^ InputIteratorTransformer (8) - : : : +- ^ InputAdapter (7) - : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (91) + +- ^ RegularHashAggregateExecTransformer (89) + +- ^ InputIteratorTransformer (88) + +- ShuffleQueryStage (86), Statistics(X) + +- ColumnarExchange (85) + +- VeloxAppendBatches (84) + +- ^ ProjectExecTransformer (82) + +- ^ FlushableHashAggregateExecTransformer (81) + +- ^ ProjectExecTransformer (80) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (79) + :- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- 
^ ShuffledHashJoinExecTransformer Inner BuildRight (62) + : :- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (45) + : : :- ^ InputIteratorTransformer (9) + : : : +- ShuffleQueryStage (7), Statistics(X) + : : : +- ColumnarExchange (6) + : : : +- VeloxAppendBatches (5) : : : +- ^ ProjectExecTransformer (3) : : : +- ^ FilterExecTransformer (2) : : : +- ^ Scan parquet (1) - : : +- ^ InputIteratorTransformer (39) - : : +- ^ InputAdapter (38) - : : +- ^ ShuffleQueryStage (37), Statistics(X) - : : +- ColumnarExchange (36) - : : +- ^ ProjectExecTransformer (34) - : : +- ^ ShuffledHashJoinExecTransformer LeftAnti (33) - : : :- ^ ShuffledHashJoinExecTransformer LeftSemi (24) - : : : :- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ FilterExecTransformer (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (23) - : : : +- ^ InputAdapter (22) - : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : +- ColumnarExchange (20) - : : : +- ^ ProjectExecTransformer (18) - : : : +- ^ Scan parquet (17) - : : +- ^ InputIteratorTransformer (32) - : : +- ^ InputAdapter (31) - : : +- ^ ShuffleQueryStage (30), Statistics(X) - : : +- ColumnarExchange (29) - : : +- ^ ProjectExecTransformer (27) - : : +- ^ FilterExecTransformer (26) - : : +- ^ Scan parquet (25) - : +- ^ InputIteratorTransformer (54) - : +- ^ InputAdapter (53) - : +- ^ ShuffleQueryStage (52), Statistics(X) - : +- ColumnarExchange (51) - : +- ^ ProjectExecTransformer (49) - : +- ^ FilterExecTransformer (48) - : +- ^ Scan parquet (47) - +- ^ InputIteratorTransformer (69) - +- ^ InputAdapter (68) - +- ^ ShuffleQueryStage (67), Statistics(X) - +- ColumnarExchange (66) - +- ^ ProjectExecTransformer (64) - +- ^ FilterExecTransformer (63) - +- ^ Scan parquet (62) + : : +- ^ InputIteratorTransformer (44) + : : +- ShuffleQueryStage (42), Statistics(X) + : : +- ColumnarExchange (41) + : : +- VeloxAppendBatches (40) + : : +- ^ ProjectExecTransformer (38) + : : +- ^ ShuffledHashJoinExecTransformer LeftAnti BuildRight (37) + : : :- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (27) + : : : :- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ FilterExecTransformer (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (26) + : : : +- ShuffleQueryStage (24), Statistics(X) + : : : +- ColumnarExchange (23) + : : : +- VeloxAppendBatches (22) + : : : +- ^ ProjectExecTransformer (20) + : : : +- ^ Scan parquet (19) + : : +- ^ InputIteratorTransformer (36) + : : +- ShuffleQueryStage (34), Statistics(X) + : : +- ColumnarExchange (33) + : : +- VeloxAppendBatches (32) + : : +- ^ ProjectExecTransformer (30) + : : +- ^ FilterExecTransformer (29) + : : +- ^ Scan parquet (28) + : +- ^ InputIteratorTransformer (61) + : +- ShuffleQueryStage (59), Statistics(X) + : +- ColumnarExchange (58) + : +- VeloxAppendBatches (57) + : +- ^ ProjectExecTransformer (55) + : +- ^ FilterExecTransformer (54) + : +- ^ Scan parquet (53) + +- ^ InputIteratorTransformer (78) + +- ShuffleQueryStage (76), Statistics(X) + +- ColumnarExchange 
(75) + +- VeloxAppendBatches (74) + +- ^ ProjectExecTransformer (72) + +- ^ FilterExecTransformer (71) + +- ^ Scan parquet (70) +- == Initial Plan == - TakeOrderedAndProject (117) - +- HashAggregate (116) - +- Exchange (115) - +- HashAggregate (114) - +- Project (113) - +- ShuffledHashJoin Inner BuildRight (112) - :- Exchange (107) - : +- Project (106) - : +- ShuffledHashJoin Inner BuildRight (105) - : :- Exchange (100) - : : +- Project (99) - : : +- ShuffledHashJoin Inner BuildLeft (98) - : : :- Exchange (84) - : : : +- Filter (83) - : : : +- Scan parquet (82) - : : +- Exchange (97) - : : +- ShuffledHashJoin LeftAnti BuildRight (96) - : : :- ShuffledHashJoin LeftSemi BuildRight (91) - : : : :- Exchange (88) - : : : : +- Project (87) - : : : : +- Filter (86) - : : : : +- Scan parquet (85) - : : : +- Exchange (90) - : : : +- Scan parquet (89) - : : +- Exchange (95) - : : +- Project (94) - : : +- Filter (93) - : : +- Scan parquet (92) - : +- Exchange (104) - : +- Project (103) - : +- Filter (102) - : +- Scan parquet (101) - +- Exchange (111) - +- Project (110) - +- Filter (109) - +- Scan parquet (108) + TakeOrderedAndProject (136) + +- HashAggregate (135) + +- Exchange (134) + +- HashAggregate (133) + +- Project (132) + +- SortMergeJoin Inner (131) + :- Sort (125) + : +- Exchange (124) + : +- Project (123) + : +- SortMergeJoin Inner (122) + : :- Sort (116) + : : +- Exchange (115) + : : +- Project (114) + : : +- SortMergeJoin Inner (113) + : : :- Sort (95) + : : : +- Exchange (94) + : : : +- Filter (93) + : : : +- Scan parquet (92) + : : +- Sort (112) + : : +- Exchange (111) + : : +- SortMergeJoin LeftAnti (110) + : : :- SortMergeJoin LeftSemi (104) + : : : :- Sort (100) + : : : : +- Exchange (99) + : : : : +- Project (98) + : : : : +- Filter (97) + : : : : +- Scan parquet (96) + : : : +- Sort (103) + : : : +- Exchange (102) + : : : +- Scan parquet (101) + : : +- Sort (109) + : : +- Exchange (108) + : : +- Project (107) + : : +- Filter (106) + : : +- Scan parquet (105) + : +- Sort (121) + : +- Exchange (120) + : +- Project (119) + : +- Filter (118) + : +- Scan parquet (117) + +- Sort (130) + +- Exchange (129) + +- Project (128) + +- Filter (127) + +- Scan parquet (126) (1) Scan parquet @@ -129,500 +138,576 @@ Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Arguments: ((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND 
(l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(17) Scan parquet +(19) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [2]: [l_orderkey#X, l_suppkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(23) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(24) ShuffledHashJoinExecTransformer +(27) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: NOT (l_suppkey#X = l_suppkey#X) -(25) Scan parquet +(28) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(26) FilterExecTransformer +(29) FilterExecTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Arguments: ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(27) ProjectExecTransformer +(30) ProjectExecTransformer Output [3]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(28) WholeStageCodegenTransformer (X) +(31) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(29) ColumnarExchange +(32) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(33) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] 
-(30) ShuffleQueryStage +(34) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(31) InputAdapter +(35) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(32) InputIteratorTransformer +(36) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(33) ShuffledHashJoinExecTransformer +(37) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftAnti Join condition: NOT (l_suppkey#X = l_suppkey#X) -(34) ProjectExecTransformer +(38) ProjectExecTransformer Output [3]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X] Input [2]: [l_orderkey#X, l_suppkey#X] -(35) WholeStageCodegenTransformer (X) +(39) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: false -(36) ColumnarExchange +(40) VeloxAppendBatches +Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] +Arguments: X + +(41) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] -(37) ShuffleQueryStage +(42) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] Arguments: X -(38) InputAdapter +(43) InputAdapter Input [2]: [l_orderkey#X, l_suppkey#X] -(39) InputIteratorTransformer +(44) InputIteratorTransformer Input [2]: [l_orderkey#X, l_suppkey#X] -(40) ShuffledHashJoinExecTransformer +(45) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join type: Inner Join condition: None -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] -(47) Scan parquet +(53) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(48) FilterExecTransformer +(54) FilterExecTransformer Input [2]: [o_orderkey#X, o_orderstatus#X] Arguments: ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(49) ProjectExecTransformer +(55) ProjectExecTransformer Output [2]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(50) WholeStageCodegenTransformer (X) +(56) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, o_orderkey#X] Arguments: false -(51) ColumnarExchange +(57) VeloxAppendBatches +Input [2]: [hash_partition_key#X, o_orderkey#X] +Arguments: X + +(58) 
ColumnarExchange Input [2]: [hash_partition_key#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [id=#X] -(52) ShuffleQueryStage +(59) ShuffleQueryStage Output [1]: [o_orderkey#X] Arguments: X -(53) InputAdapter +(60) InputAdapter Input [1]: [o_orderkey#X] -(54) InputIteratorTransformer +(61) InputIteratorTransformer Input [1]: [o_orderkey#X] -(55) ShuffledHashJoinExecTransformer +(62) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [s_name#X, s_nationkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [s_name#X, s_nationkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [s_name#X, s_nationkey#X] -(62) Scan parquet +(70) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(63) FilterExecTransformer +(71) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(64) ProjectExecTransformer +(72) ProjectExecTransformer Output [2]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(65) WholeStageCodegenTransformer (X) +(73) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: false -(66) ColumnarExchange +(74) VeloxAppendBatches +Input [2]: [hash_partition_key#X, n_nationkey#X] +Arguments: X + +(75) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] -(67) ShuffleQueryStage +(76) ShuffleQueryStage Output [1]: [n_nationkey#X] Arguments: X -(68) InputAdapter +(77) InputAdapter Input [1]: [n_nationkey#X] -(69) InputIteratorTransformer +(78) InputIteratorTransformer Input [1]: [n_nationkey#X] -(70) ShuffledHashJoinExecTransformer +(79) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(72) FlushableHashAggregateExecTransformer +(81) FlushableHashAggregateExecTransformer Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(73) ProjectExecTransformer +(82) ProjectExecTransformer Output [3]: [hash(s_name#X, 42) AS hash_partition_key#X, s_name#X, count#X] Input [2]: [s_name#X, count#X] -(74) WholeStageCodegenTransformer 
(X) +(83) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: false -(75) ColumnarExchange +(84) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_name#X, count#X] +Arguments: X + +(85) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] -(76) ShuffleQueryStage +(86) ShuffleQueryStage Output [2]: [s_name#X, count#X] Arguments: X -(77) InputAdapter +(87) InputAdapter Input [2]: [s_name#X, count#X] -(78) InputIteratorTransformer +(88) InputIteratorTransformer Input [2]: [s_name#X, count#X] -(79) RegularHashAggregateExecTransformer +(89) RegularHashAggregateExecTransformer Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(80) WholeStageCodegenTransformer (X) +(90) WholeStageCodegenTransformer (X) Input [2]: [s_name#X, numwait#X] Arguments: false -(81) VeloxColumnarToRowExec +(91) VeloxColumnarToRowExec Input [2]: [s_name#X, numwait#X] -(82) Scan parquet +(92) Scan parquet Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(83) Filter +(93) Filter Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(84) Exchange +(94) Exchange Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(85) Scan parquet +(95) Sort +Input [3]: [s_suppkey#X, s_name#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(96) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(86) Filter +(97) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(87) Project +(98) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(88) Exchange +(99) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(89) Scan parquet +(100) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(101) Scan parquet Output [2]: [l_orderkey#X, l_suppkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(90) Exchange +(102) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(91) ShuffledHashJoin +(103) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(104) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: NOT (l_suppkey#X = l_suppkey#X) -(92) Scan parquet +(105) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_receiptdate), IsNotNull(l_commitdate)] ReadSchema: struct -(93) Filter 
+(106) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_receiptdate#X) AND isnotnull(l_commitdate#X)) AND (l_receiptdate#X > l_commitdate#X)) -(94) Project +(107) Project Output [2]: [l_orderkey#X, l_suppkey#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_commitdate#X, l_receiptdate#X] -(95) Exchange +(108) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(96) ShuffledHashJoin +(109) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(110) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftAnti Join condition: NOT (l_suppkey#X = l_suppkey#X) -(97) Exchange +(111) Exchange Input [2]: [l_orderkey#X, l_suppkey#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(98) ShuffledHashJoin +(112) Sort +Input [2]: [l_orderkey#X, l_suppkey#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(113) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join type: Inner Join condition: None -(99) Project +(114) Project Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Input [5]: [s_suppkey#X, s_name#X, s_nationkey#X, l_orderkey#X, l_suppkey#X] -(100) Exchange +(115) Exchange Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(101) Scan parquet +(116) Sort +Input [3]: [s_name#X, s_nationkey#X, l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(117) Scan parquet Output [2]: [o_orderkey#X, o_orderstatus#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderstatus), EqualTo(o_orderstatus,F), IsNotNull(o_orderkey)] ReadSchema: struct -(102) Filter +(118) Filter Input [2]: [o_orderkey#X, o_orderstatus#X] Condition : ((isnotnull(o_orderstatus#X) AND (o_orderstatus#X = F)) AND isnotnull(o_orderkey#X)) -(103) Project +(119) Project Output [1]: [o_orderkey#X] Input [2]: [o_orderkey#X, o_orderstatus#X] -(104) Exchange +(120) Exchange Input [1]: [o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) ShuffledHashJoin +(121) Sort +Input [1]: [o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(122) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(106) Project +(123) Project Output [2]: [s_name#X, s_nationkey#X] Input [4]: [s_name#X, s_nationkey#X, l_orderkey#X, o_orderkey#X] -(107) Exchange +(124) Exchange Input [2]: [s_name#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) Scan parquet +(125) Sort +Input [2]: [s_name#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(126) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_name), EqualTo(n_name,SAUDI ARABIA), IsNotNull(n_nationkey)] ReadSchema: struct -(109) Filter +(127) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : ((isnotnull(n_name#X) AND (n_name#X = SAUDI ARABIA)) AND isnotnull(n_nationkey#X)) -(110) Project +(128) Project Output [1]: [n_nationkey#X] Input [2]: [n_nationkey#X, n_name#X] -(111) Exchange +(129) Exchange Input [1]: [n_nationkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) ShuffledHashJoin +(130) Sort +Input [1]: 
[n_nationkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(131) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(113) Project +(132) Project Output [1]: [s_name#X] Input [3]: [s_name#X, s_nationkey#X, n_nationkey#X] -(114) HashAggregate +(133) HashAggregate Input [1]: [s_name#X] Keys [1]: [s_name#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [s_name#X, count#X] -(115) Exchange +(134) Exchange Input [2]: [s_name#X, count#X] Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(116) HashAggregate +(135) HashAggregate Input [2]: [s_name#X, count#X] Keys [1]: [s_name#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [s_name#X, count(1)#X AS numwait#X] -(117) TakeOrderedAndProject +(136) TakeOrderedAndProject Input [2]: [s_name#X, numwait#X] Arguments: X, [numwait#X DESC NULLS LAST, s_name#X ASC NULLS FIRST], [s_name#X, numwait#X] -(118) AdaptiveSparkPlan +(137) AdaptiveSparkPlan Output [2]: [s_name#X, numwait#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/22.txt index 86f293b4ff074..50f1c1bdef30b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/22.txt @@ -1,47 +1,49 @@ == Physical Plan == -AdaptiveSparkPlan (46) +AdaptiveSparkPlan (52) +- == Final Plan == - VeloxColumnarToRowExec (33) - +- ^ SortExecTransformer (31) - +- ^ InputIteratorTransformer (30) - +- ^ InputAdapter (29) - +- ^ ShuffleQueryStage (28), Statistics(X) - +- ColumnarExchange (27) - +- ^ RegularHashAggregateExecTransformer (25) - +- ^ InputIteratorTransformer (24) - +- ^ InputAdapter (23) - +- ^ ShuffleQueryStage (22), Statistics(X) - +- ColumnarExchange (21) - +- ^ ProjectExecTransformer (19) - +- ^ FlushableHashAggregateExecTransformer (18) - +- ^ ProjectExecTransformer (17) - +- ^ ShuffledHashJoinExecTransformer LeftAnti (16) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (37) + +- ^ SortExecTransformer (35) + +- ^ InputIteratorTransformer (34) + +- ShuffleQueryStage (32), Statistics(X) + +- ColumnarExchange (31) + +- VeloxAppendBatches (30) + +- ^ RegularHashAggregateExecTransformer (28) + +- ^ InputIteratorTransformer (27) + +- ShuffleQueryStage (25), Statistics(X) + +- ColumnarExchange (24) + +- VeloxAppendBatches (23) + +- ^ ProjectExecTransformer (21) + +- ^ FlushableHashAggregateExecTransformer (20) + +- ^ ProjectExecTransformer (19) + +- ^ ShuffledHashJoinExecTransformer LeftAnti BuildRight (18) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (15) - +- ^ InputAdapter (14) - +- ^ ShuffleQueryStage (13), Statistics(X) - +- ColumnarExchange (12) - +- ^ ProjectExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (17) + +- ShuffleQueryStage (15), Statistics(X) + +- ColumnarExchange (14) + +- VeloxAppendBatches (13) + +- ^ ProjectExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == - Sort (45) - +- Exchange (44) - +- 
HashAggregate (43) - +- Exchange (42) - +- HashAggregate (41) - +- Project (40) - +- ShuffledHashJoin LeftAnti BuildRight (39) - :- Exchange (36) - : +- Filter (35) - : +- Scan parquet (34) - +- Exchange (38) - +- Scan parquet (37) + Sort (51) + +- Exchange (50) + +- HashAggregate (49) + +- Exchange (48) + +- HashAggregate (47) + +- Project (46) + +- SortMergeJoin LeftAnti (45) + :- Sort (41) + : +- Exchange (40) + : +- Filter (39) + : +- Scan parquet (38) + +- Sort (44) + +- Exchange (43) + +- Scan parquet (42) (1) Scan parquet @@ -63,320 +65,348 @@ Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] +Arguments: X + +(6) ColumnarExchange Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(9) Scan parquet +(10) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(10) ProjectExecTransformer +(11) ProjectExecTransformer Output [2]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_custkey#X] Input [1]: [o_custkey#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, o_custkey#X] Arguments: false -(12) ColumnarExchange +(13) VeloxAppendBatches +Input [2]: [hash_partition_key#X, o_custkey#X] +Arguments: X + +(14) ColumnarExchange Input [2]: [hash_partition_key#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [id=#X] -(13) ShuffleQueryStage +(15) ShuffleQueryStage Output [1]: [o_custkey#X] Arguments: X -(14) InputAdapter +(16) InputAdapter Input [1]: [o_custkey#X] -(15) InputIteratorTransformer +(17) InputIteratorTransformer Input [1]: [o_custkey#X] -(16) ShuffledHashJoinExecTransformer +(18) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: LeftAnti Join condition: None -(17) ProjectExecTransformer +(19) ProjectExecTransformer Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(18) FlushableHashAggregateExecTransformer +(20) FlushableHashAggregateExecTransformer Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(19) ProjectExecTransformer +(21) ProjectExecTransformer Output [5]: [hash(cntrycode#X, 42) AS hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(20) WholeStageCodegenTransformer (X) +(22) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: false -(21) ColumnarExchange +(23) VeloxAppendBatches +Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] +Arguments: X + +(24) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, 
count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(22) ShuffleQueryStage +(25) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: X -(23) InputAdapter +(26) InputAdapter Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(24) InputIteratorTransformer +(27) InputIteratorTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(25) RegularHashAggregateExecTransformer +(28) RegularHashAggregateExecTransformer Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(26) WholeStageCodegenTransformer (X) +(29) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(27) ColumnarExchange +(30) VeloxAppendBatches +Input [3]: [cntrycode#X, numcust#X, totacctbal#X] +Arguments: X + +(31) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(28) ShuffleQueryStage +(32) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: X -(29) InputAdapter +(33) InputAdapter Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(30) InputIteratorTransformer +(34) InputIteratorTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(31) SortExecTransformer +(35) SortExecTransformer Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(32) WholeStageCodegenTransformer (X) +(36) WholeStageCodegenTransformer (X) Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: false -(33) VeloxColumnarToRowExec +(37) VeloxColumnarToRowExec Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -(34) Scan parquet +(38) Scan parquet Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal)] ReadSchema: struct -(35) Filter +(39) Filter Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) AND (cast(c_acctbal#X as decimal(16,6)) > Subquery subquery#X, [id=#X])) -(36) Exchange +(40) Exchange Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(37) Scan parquet +(41) Sort +Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(42) Scan parquet Output [1]: [o_custkey#X] Batched: true Location: InMemoryFileIndex [*] ReadSchema: struct -(38) Exchange +(43) Exchange Input [1]: [o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) ShuffledHashJoin +(44) Sort +Input [1]: [o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(45) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: LeftAnti Join condition: None -(40) Project +(46) Project Output [2]: [substring(c_phone#X, 1, 2) AS cntrycode#X, c_acctbal#X] Input [3]: [c_custkey#X, c_phone#X, c_acctbal#X] -(41) HashAggregate +(47) HashAggregate Input [2]: [cntrycode#X, c_acctbal#X] Keys [1]: [cntrycode#X] Functions [2]: [partial_count(1), partial_sum(c_acctbal#X)] Aggregate Attributes [3]: [count#X, sum#X, isEmpty#X] Results 
[4]: [cntrycode#X, count#X, sum#X, isEmpty#X] -(42) Exchange +(48) Exchange Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) HashAggregate +(49) HashAggregate Input [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] Keys [1]: [cntrycode#X] Functions [2]: [count(1), sum(c_acctbal#X)] Aggregate Attributes [2]: [count(1)#X, sum(c_acctbal#X)#X] Results [3]: [cntrycode#X, count(1)#X AS numcust#X, sum(c_acctbal#X)#X AS totacctbal#X] -(44) Exchange +(50) Exchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(45) Sort +(51) Sort Input [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: [cntrycode#X ASC NULLS FIRST], true, 0 -(46) AdaptiveSparkPlan +(52) AdaptiveSparkPlan Output [3]: [cntrycode#X, numcust#X, totacctbal#X] Arguments: isFinalPlan=true ===== Subqueries ===== Subquery:1 Hosting operator id = 2 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (65) +AdaptiveSparkPlan (72) +- == Final Plan == - VeloxColumnarToRowExec (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) - +- ^ FlushableHashAggregateExecTransformer (50) - +- ^ ProjectExecTransformer (49) - +- ^ FilterExecTransformer (48) - +- ^ Scan parquet (47) + VeloxColumnarToRowExec (65) + +- ^ RegularHashAggregateExecTransformer (63) + +- ^ InputIteratorTransformer (62) + +- ShuffleQueryStage (60), Statistics(X) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ FlushableHashAggregateExecTransformer (56) + +- ^ ProjectExecTransformer (55) + +- ^ FilterExecTransformer (54) + +- ^ Scan parquet (53) +- == Initial Plan == - HashAggregate (64) - +- Exchange (63) - +- HashAggregate (62) - +- Project (61) - +- Filter (60) - +- Scan parquet (59) + HashAggregate (71) + +- Exchange (70) + +- HashAggregate (69) + +- Project (68) + +- Filter (67) + +- Scan parquet (66) -(47) Scan parquet +(53) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(48) FilterExecTransformer +(54) FilterExecTransformer Input [2]: [c_phone#X, c_acctbal#X] Arguments: ((isnotnull(c_acctbal#X) AND (c_acctbal#X > 0.00)) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) -(49) ProjectExecTransformer +(55) ProjectExecTransformer Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(50) FlushableHashAggregateExecTransformer +(56) FlushableHashAggregateExecTransformer Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(51) WholeStageCodegenTransformer (X) +(57) WholeStageCodegenTransformer (X) Input [2]: [sum#X, count#X] Arguments: false -(52) ColumnarExchange +(58) VeloxAppendBatches +Input [2]: [sum#X, count#X] +Arguments: X + +(59) ColumnarExchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(53) ShuffleQueryStage +(60) ShuffleQueryStage Output [2]: [sum#X, count#X] Arguments: X -(54) InputAdapter +(61) InputAdapter Input [2]: [sum#X, count#X] -(55) InputIteratorTransformer +(62) InputIteratorTransformer Input [2]: [sum#X, count#X] -(56) RegularHashAggregateExecTransformer +(63) RegularHashAggregateExecTransformer Input [2]: [sum#X, 
count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [1]: [avg(c_acctbal)#X] Arguments: false -(58) VeloxColumnarToRowExec +(65) VeloxColumnarToRowExec Input [1]: [avg(c_acctbal)#X] -(59) Scan parquet +(66) Scan parquet Output [2]: [c_phone#X, c_acctbal#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_acctbal), GreaterThan(c_acctbal,0.00)] ReadSchema: struct -(60) Filter +(67) Filter Input [2]: [c_phone#X, c_acctbal#X] Condition : ((isnotnull(c_acctbal#X) AND (c_acctbal#X > 0.00)) AND substring(c_phone#X, 1, 2) IN (13,31,23,29,30,18,17)) -(61) Project +(68) Project Output [1]: [c_acctbal#X] Input [2]: [c_phone#X, c_acctbal#X] -(62) HashAggregate +(69) HashAggregate Input [1]: [c_acctbal#X] Keys: [] Functions [1]: [partial_avg(c_acctbal#X)] Aggregate Attributes [2]: [sum#X, count#X] Results [2]: [sum#X, count#X] -(63) Exchange +(70) Exchange Input [2]: [sum#X, count#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X] -(64) HashAggregate +(71) HashAggregate Input [2]: [sum#X, count#X] Keys: [] Functions [1]: [avg(c_acctbal#X)] Aggregate Attributes [1]: [avg(c_acctbal#X)#X] Results [1]: [avg(c_acctbal#X)#X AS avg(c_acctbal)#X] -(65) AdaptiveSparkPlan +(72) AdaptiveSparkPlan Output [1]: [avg(c_acctbal)#X] Arguments: isFinalPlan=true Subquery:2 Hosting operator id = 1 Hosting Expression = Subquery subquery#X, [id=#X] -AdaptiveSparkPlan (65) +AdaptiveSparkPlan (72) +- == Final Plan == - VeloxColumnarToRowExec (58) - +- ^ RegularHashAggregateExecTransformer (56) - +- ^ InputIteratorTransformer (55) - +- ^ InputAdapter (54) - +- ^ ShuffleQueryStage (53), Statistics(X) - +- ColumnarExchange (52) - +- ^ FlushableHashAggregateExecTransformer (50) - +- ^ ProjectExecTransformer (49) - +- ^ FilterExecTransformer (48) - +- ^ Scan parquet (47) + VeloxColumnarToRowExec (65) + +- ^ RegularHashAggregateExecTransformer (63) + +- ^ InputIteratorTransformer (62) + +- ShuffleQueryStage (60), Statistics(X) + +- ColumnarExchange (59) + +- VeloxAppendBatches (58) + +- ^ FlushableHashAggregateExecTransformer (56) + +- ^ ProjectExecTransformer (55) + +- ^ FilterExecTransformer (54) + +- ^ Scan parquet (53) +- == Initial Plan == - HashAggregate (64) - +- Exchange (63) - +- HashAggregate (62) - +- Project (61) - +- Filter (60) - +- Scan parquet (59) \ No newline at end of file + HashAggregate (71) + +- Exchange (70) + +- HashAggregate (69) + +- Project (68) + +- Filter (67) + +- Scan parquet (66) \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/3.txt index 0c75b0257abac..50ad3b59c3471 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/3.txt @@ -1,60 +1,64 @@ == Physical Plan == -AdaptiveSparkPlan (59) +AdaptiveSparkPlan (67) +- == Final Plan == - VeloxColumnarToRowExec (39) - +- TakeOrderedAndProjectExecTransformer (38) - +- ^ ProjectExecTransformer (36) - +- ^ RegularHashAggregateExecTransformer (35) - +- ^ RegularHashAggregateExecTransformer (34) - +- ^ ProjectExecTransformer (33) - +- ^ ShuffledHashJoinExecTransformer Inner (32) - :- ^ InputIteratorTransformer (23) - : +- ^ InputAdapter (22) - : +- ^ ShuffleQueryStage (21), Statistics(X) - : +- ColumnarExchange (20) - : 
+- ^ ProjectExecTransformer (18) - : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : :- ^ InputIteratorTransformer (8) - : : +- ^ InputAdapter (7) - : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (43) + +- TakeOrderedAndProjectExecTransformer (42) + +- ^ ProjectExecTransformer (40) + +- ^ RegularHashAggregateExecTransformer (39) + +- ^ RegularHashAggregateExecTransformer (38) + +- ^ ProjectExecTransformer (37) + +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + :- ^ InputIteratorTransformer (26) + : +- ShuffleQueryStage (24), Statistics(X) + : +- ColumnarExchange (23) + : +- VeloxAppendBatches (22) + : +- ^ ProjectExecTransformer (20) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : :- ^ InputIteratorTransformer (9) + : : +- ShuffleQueryStage (7), Statistics(X) + : : +- ColumnarExchange (6) + : : +- VeloxAppendBatches (5) : : +- ^ ProjectExecTransformer (3) : : +- ^ FilterExecTransformer (2) : : +- ^ Scan parquet (1) - : +- ^ InputIteratorTransformer (16) - : +- ^ InputAdapter (15) - : +- ^ ShuffleQueryStage (14), Statistics(X) - : +- ColumnarExchange (13) - : +- ^ ProjectExecTransformer (11) - : +- ^ FilterExecTransformer (10) - : +- ^ Scan parquet (9) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ ProjectExecTransformer (26) - +- ^ FilterExecTransformer (25) - +- ^ Scan parquet (24) + : +- ^ InputIteratorTransformer (18) + : +- ShuffleQueryStage (16), Statistics(X) + : +- ColumnarExchange (15) + : +- VeloxAppendBatches (14) + : +- ^ ProjectExecTransformer (12) + : +- ^ FilterExecTransformer (11) + : +- ^ Scan parquet (10) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ ProjectExecTransformer (29) + +- ^ FilterExecTransformer (28) + +- ^ Scan parquet (27) +- == Initial Plan == - TakeOrderedAndProject (58) - +- HashAggregate (57) - +- HashAggregate (56) - +- Project (55) - +- ShuffledHashJoin Inner BuildRight (54) - :- Exchange (49) - : +- Project (48) - : +- ShuffledHashJoin Inner BuildLeft (47) - : :- Exchange (43) - : : +- Project (42) - : : +- Filter (41) - : : +- Scan parquet (40) - : +- Exchange (46) - : +- Filter (45) - : +- Scan parquet (44) - +- Exchange (53) - +- Project (52) - +- Filter (51) - +- Scan parquet (50) + TakeOrderedAndProject (66) + +- HashAggregate (65) + +- HashAggregate (64) + +- Project (63) + +- SortMergeJoin Inner (62) + :- Sort (56) + : +- Exchange (55) + : +- Project (54) + : +- SortMergeJoin Inner (53) + : :- Sort (48) + : : +- Exchange (47) + : : +- Project (46) + : : +- Filter (45) + : : +- Scan parquet (44) + : +- Sort (52) + : +- Exchange (51) + : +- Filter (50) + : +- Scan parquet (49) + +- Sort (61) + +- Exchange (60) + +- Project (59) + +- Filter (58) + +- Scan parquet (57) (1) Scan parquet @@ -76,248 +80,280 @@ Input [2]: [c_custkey#X, c_mktsegment#X] Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, c_custkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [c_custkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [c_custkey#X] -(8) InputIteratorTransformer +(9) 
InputIteratorTransformer Input [1]: [c_custkey#X] -(9) Scan parquet +(10) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [5]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] +Arguments: X + +(15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [4]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] +Arguments: X + +(23) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [4]: [l_orderkey#X, l_extendedprice#X, 
l_discount#X, l_shipdate#X] Arguments: ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [4]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [6]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(34) RegularHashAggregateExecTransformer +(38) RegularHashAggregateExecTransformer Input [6]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X, _pre_X#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(35) RegularHashAggregateExecTransformer +(39) RegularHashAggregateExecTransformer Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(36) ProjectExecTransformer +(40) ProjectExecTransformer Output [4]: [l_orderkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] Input [4]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X] -(37) WholeStageCodegenTransformer (X) +(41) WholeStageCodegenTransformer (X) Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: false -(38) TakeOrderedAndProjectExecTransformer +(42) TakeOrderedAndProjectExecTransformer Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X], 0 -(39) VeloxColumnarToRowExec +(43) VeloxColumnarToRowExec Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(40) Scan parquet +(44) Scan parquet Output [2]: 
[c_custkey#X, c_mktsegment#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_mktsegment), EqualTo(c_mktsegment,BUILDING), IsNotNull(c_custkey)] ReadSchema: struct -(41) Filter +(45) Filter Input [2]: [c_custkey#X, c_mktsegment#X] Condition : ((isnotnull(c_mktsegment#X) AND (c_mktsegment#X = BUILDING)) AND isnotnull(c_custkey#X)) -(42) Project +(46) Project Output [1]: [c_custkey#X] Input [2]: [c_custkey#X, c_mktsegment#X] -(43) Exchange +(47) Exchange Input [1]: [c_custkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(44) Scan parquet +(48) Sort +Input [1]: [c_custkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(49) Scan parquet Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), LessThan(o_orderdate,1995-03-15), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(45) Filter +(50) Filter Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Condition : (((isnotnull(o_orderdate#X) AND (o_orderdate#X < 1995-03-15)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(46) Exchange +(51) Exchange Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(47) ShuffledHashJoin +(52) Sort +Input [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(53) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(48) Project +(54) Project Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Input [5]: [c_custkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -(49) Exchange +(55) Exchange Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(50) Scan parquet +(56) Sort +Input [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(57) Scan parquet Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThan(l_shipdate,1995-03-15), IsNotNull(l_orderkey)] ReadSchema: struct -(51) Filter +(58) Filter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((isnotnull(l_shipdate#X) AND (l_shipdate#X > 1995-03-15)) AND isnotnull(l_orderkey#X)) -(52) Project +(59) Project Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(53) Exchange +(60) Exchange Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(54) ShuffledHashJoin +(61) Sort +Input [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(62) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(55) Project +(63) Project Output [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] Input [6]: [o_orderkey#X, o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -(56) HashAggregate +(64) HashAggregate Input [5]: [o_orderdate#X, o_shippriority#X, l_orderkey#X, l_extendedprice#X, 
l_discount#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -(57) HashAggregate +(65) HashAggregate Input [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] Keys [3]: [l_orderkey#X, o_orderdate#X, o_shippriority#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [4]: [l_orderkey#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X, o_orderdate#X, o_shippriority#X] -(58) TakeOrderedAndProject +(66) TakeOrderedAndProject Input [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: X, [revenue#X DESC NULLS LAST, o_orderdate#X ASC NULLS FIRST], [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] -(59) AdaptiveSparkPlan +(67) AdaptiveSparkPlan Output [4]: [l_orderkey#X, revenue#X, o_orderdate#X, o_shippriority#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/4.txt index ee47fca61115a..2db46753e9fcf 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/4.txt @@ -1,51 +1,53 @@ == Physical Plan == -AdaptiveSparkPlan (50) +AdaptiveSparkPlan (56) +- == Final Plan == - VeloxColumnarToRowExec (34) - +- ^ SortExecTransformer (32) - +- ^ InputIteratorTransformer (31) - +- ^ InputAdapter (30) - +- ^ ShuffleQueryStage (29), Statistics(X) - +- ColumnarExchange (28) - +- ^ RegularHashAggregateExecTransformer (26) - +- ^ InputIteratorTransformer (25) - +- ^ InputAdapter (24) - +- ^ ShuffleQueryStage (23), Statistics(X) - +- ColumnarExchange (22) - +- ^ ProjectExecTransformer (20) - +- ^ FlushableHashAggregateExecTransformer (19) - +- ^ ProjectExecTransformer (18) - +- ^ ShuffledHashJoinExecTransformer LeftSemi (17) - :- ^ InputIteratorTransformer (8) - : +- ^ InputAdapter (7) - : +- ^ ShuffleQueryStage (6), Statistics(X) - : +- ColumnarExchange (5) + VeloxColumnarToRowExec (38) + +- ^ SortExecTransformer (36) + +- ^ InputIteratorTransformer (35) + +- ShuffleQueryStage (33), Statistics(X) + +- ColumnarExchange (32) + +- VeloxAppendBatches (31) + +- ^ RegularHashAggregateExecTransformer (29) + +- ^ InputIteratorTransformer (28) + +- ShuffleQueryStage (26), Statistics(X) + +- ColumnarExchange (25) + +- VeloxAppendBatches (24) + +- ^ ProjectExecTransformer (22) + +- ^ FlushableHashAggregateExecTransformer (21) + +- ^ ProjectExecTransformer (20) + +- ^ ShuffledHashJoinExecTransformer LeftSemi BuildRight (19) + :- ^ InputIteratorTransformer (9) + : +- ShuffleQueryStage (7), Statistics(X) + : +- ColumnarExchange (6) + : +- VeloxAppendBatches (5) : +- ^ ProjectExecTransformer (3) : +- ^ FilterExecTransformer (2) : +- ^ Scan parquet (1) - +- ^ InputIteratorTransformer (16) - +- ^ InputAdapter (15) - +- ^ ShuffleQueryStage (14), Statistics(X) - +- ColumnarExchange (13) - +- ^ ProjectExecTransformer (11) - +- ^ FilterExecTransformer (10) - +- ^ Scan parquet (9) + +- ^ InputIteratorTransformer (18) + +- ShuffleQueryStage (16), Statistics(X) + +- ColumnarExchange (15) + +- VeloxAppendBatches (14) + +- ^ ProjectExecTransformer (12) + +- ^ FilterExecTransformer (11) + +- ^ Scan parquet (10) +- == Initial Plan == 
- Sort (49) - +- Exchange (48) - +- HashAggregate (47) - +- Exchange (46) - +- HashAggregate (45) - +- Project (44) - +- ShuffledHashJoin LeftSemi BuildRight (43) - :- Exchange (38) - : +- Project (37) - : +- Filter (36) - : +- Scan parquet (35) - +- Exchange (42) - +- Project (41) - +- Filter (40) - +- Scan parquet (39) + Sort (55) + +- Exchange (54) + +- HashAggregate (53) + +- Exchange (52) + +- HashAggregate (51) + +- Project (50) + +- SortMergeJoin LeftSemi (49) + :- Sort (43) + : +- Exchange (42) + : +- Project (41) + : +- Filter (40) + : +- Scan parquet (39) + +- Sort (48) + +- Exchange (47) + +- Project (46) + +- Filter (45) + +- Scan parquet (44) (1) Scan parquet @@ -67,202 +69,226 @@ Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [o_orderkey#X, o_orderpriority#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderpriority#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Arguments: ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [2]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X] Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, l_orderkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [2]: [hash_partition_key#X, l_orderkey#X] +Arguments: X + +(15) ColumnarExchange Input [2]: [hash_partition_key#X, l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [1]: [l_orderkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [1]: [l_orderkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [1]: [l_orderkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(19) FlushableHashAggregateExecTransformer +(21) FlushableHashAggregateExecTransformer Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(20) ProjectExecTransformer +(22) ProjectExecTransformer Output [3]: [hash(o_orderpriority#X, 42) AS hash_partition_key#X, o_orderpriority#X, count#X] Input [2]: [o_orderpriority#X, 
count#X] -(21) WholeStageCodegenTransformer (X) +(23) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: false -(22) ColumnarExchange +(24) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] +Arguments: X + +(25) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] -(23) ShuffleQueryStage +(26) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] Arguments: X -(24) InputAdapter +(27) InputAdapter Input [2]: [o_orderpriority#X, count#X] -(25) InputIteratorTransformer +(28) InputIteratorTransformer Input [2]: [o_orderpriority#X, count#X] -(26) RegularHashAggregateExecTransformer +(29) RegularHashAggregateExecTransformer Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [2]: [o_orderpriority#X, order_count#X] +Arguments: X + +(32) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [o_orderpriority#X, order_count#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [o_orderpriority#X, order_count#X] -(32) SortExecTransformer +(36) SortExecTransformer Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(33) WholeStageCodegenTransformer (X) +(37) WholeStageCodegenTransformer (X) Input [2]: [o_orderpriority#X, order_count#X] Arguments: false -(34) VeloxColumnarToRowExec +(38) VeloxColumnarToRowExec Input [2]: [o_orderpriority#X, order_count#X] -(35) Scan parquet +(39) Scan parquet Output [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1993-07-01), LessThan(o_orderdate,1993-10-01)] ReadSchema: struct -(36) Filter +(40) Filter Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] Condition : ((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1993-07-01)) AND (o_orderdate#X < 1993-10-01)) -(37) Project +(41) Project Output [2]: [o_orderkey#X, o_orderpriority#X] Input [3]: [o_orderkey#X, o_orderdate#X, o_orderpriority#X] -(38) Exchange +(42) Exchange Input [2]: [o_orderkey#X, o_orderpriority#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(39) Scan parquet +(43) Sort +Input [2]: [o_orderkey#X, o_orderpriority#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(44) Scan parquet Output [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_commitdate), IsNotNull(l_receiptdate)] ReadSchema: struct -(40) Filter +(45) Filter Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] Condition : ((isnotnull(l_commitdate#X) AND isnotnull(l_receiptdate#X)) AND (l_commitdate#X < l_receiptdate#X)) -(41) Project +(46) Project Output [1]: [l_orderkey#X] 
Input [3]: [l_orderkey#X, l_commitdate#X, l_receiptdate#X] -(42) Exchange +(47) Exchange Input [1]: [l_orderkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(43) ShuffledHashJoin +(48) Sort +Input [1]: [l_orderkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(49) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: LeftSemi Join condition: None -(44) Project +(50) Project Output [1]: [o_orderpriority#X] Input [2]: [o_orderkey#X, o_orderpriority#X] -(45) HashAggregate +(51) HashAggregate Input [1]: [o_orderpriority#X] Keys [1]: [o_orderpriority#X] Functions [1]: [partial_count(1)] Aggregate Attributes [1]: [count#X] Results [2]: [o_orderpriority#X, count#X] -(46) Exchange +(52) Exchange Input [2]: [o_orderpriority#X, count#X] Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(47) HashAggregate +(53) HashAggregate Input [2]: [o_orderpriority#X, count#X] Keys [1]: [o_orderpriority#X] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#X] Results [2]: [o_orderpriority#X, count(1)#X AS order_count#X] -(48) Exchange +(54) Exchange Input [2]: [o_orderpriority#X, order_count#X] Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(49) Sort +(55) Sort Input [2]: [o_orderpriority#X, order_count#X] Arguments: [o_orderpriority#X ASC NULLS FIRST], true, 0 -(50) AdaptiveSparkPlan +(56) AdaptiveSparkPlan Output [2]: [o_orderpriority#X, order_count#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/5.txt index 39a8f483f6390..07a5c86709f4e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/5.txt @@ -1,127 +1,137 @@ == Physical Plan == -AdaptiveSparkPlan (134) +AdaptiveSparkPlan (156) +- == Final Plan == - VeloxColumnarToRowExec (94) - +- ^ SortExecTransformer (92) - +- ^ InputIteratorTransformer (91) - +- ^ InputAdapter (90) - +- ^ ShuffleQueryStage (89), Statistics(X) - +- ColumnarExchange (88) - +- ^ RegularHashAggregateExecTransformer (86) - +- ^ InputIteratorTransformer (85) - +- ^ InputAdapter (84) - +- ^ ShuffleQueryStage (83), Statistics(X) - +- ColumnarExchange (82) - +- ^ ProjectExecTransformer (80) - +- ^ FlushableHashAggregateExecTransformer (79) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ 
ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (106) + +- ^ SortExecTransformer (104) + +- ^ InputIteratorTransformer (103) + +- ShuffleQueryStage (101), Statistics(X) + +- ColumnarExchange (100) + +- VeloxAppendBatches (99) + +- ^ RegularHashAggregateExecTransformer (97) + +- ^ InputIteratorTransformer (96) + +- ShuffleQueryStage (94), Statistics(X) + +- ColumnarExchange (93) + +- VeloxAppendBatches (92) + +- ^ ProjectExecTransformer (90) + +- ^ FlushableHashAggregateExecTransformer (89) + +- ^ ProjectExecTransformer (88) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75), Statistics(X) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58), Statistics(X) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41), Statistics(X) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ FilterExecTransformer (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ FilterExecTransformer (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ FilterExecTransformer (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ FilterExecTransformer (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ FilterExecTransformer (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74), Statistics(X) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ FilterExecTransformer (70) - +- ^ Scan parquet (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : 
: : +- ^ ProjectExecTransformer (12) + : : : : +- ^ FilterExecTransformer (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33), Statistics(X) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ FilterExecTransformer (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ FilterExecTransformer (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ FilterExecTransformer (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (86) + +- ShuffleQueryStage (84), Statistics(X) + +- ColumnarExchange (83) + +- VeloxAppendBatches (82) + +- ^ ProjectExecTransformer (80) + +- ^ FilterExecTransformer (79) + +- ^ Scan parquet (78) +- == Initial Plan == - Sort (133) - +- Exchange (132) - +- HashAggregate (131) - +- Exchange (130) - +- HashAggregate (129) - +- Project (128) - +- ShuffledHashJoin Inner BuildRight (127) - :- Exchange (122) - : +- Project (121) - : +- ShuffledHashJoin Inner BuildRight (120) - : :- Exchange (116) - : : +- Project (115) - : : +- ShuffledHashJoin Inner BuildRight (114) - : : :- Exchange (110) - : : : +- Project (109) - : : : +- ShuffledHashJoin Inner BuildRight (108) - : : : :- Exchange (104) - : : : : +- Project (103) - : : : : +- ShuffledHashJoin Inner BuildLeft (102) - : : : : :- Exchange (97) - : : : : : +- Filter (96) - : : : : : +- Scan parquet (95) - : : : : +- Exchange (101) - : : : : +- Project (100) - : : : : +- Filter (99) - : : : : +- Scan parquet (98) - : : : +- Exchange (107) - : : : +- Filter (106) - : : : +- Scan parquet (105) - : : +- Exchange (113) - : : +- Filter (112) - : : +- Scan parquet (111) - : +- Exchange (119) - : +- Filter (118) - : +- Scan parquet (117) - +- Exchange (126) - +- Project (125) - +- Filter (124) - +- Scan parquet (123) + Sort (155) + +- Exchange (154) + +- HashAggregate (153) + +- Exchange (152) + +- HashAggregate (151) + +- Project (150) + +- SortMergeJoin Inner (149) + :- Sort (143) + : +- Exchange (142) + : +- Project (141) + : +- SortMergeJoin Inner (140) + : :- Sort (135) + : : +- Exchange (134) + : : +- Project (133) + : : +- SortMergeJoin Inner (132) + : : :- Sort (127) + : : : +- Exchange (126) + : : : +- Project (125) + : : : +- SortMergeJoin Inner (124) + : : : :- Sort (119) + : : : : +- Exchange (118) + : : : : +- Project (117) + : : : : +- SortMergeJoin Inner (116) + : : : : :- Sort (110) + : : : : : +- Exchange (109) + : : : : : +- Filter (108) + : : : : : +- Scan parquet (107) + : : : : +- Sort (115) + : : : : +- Exchange (114) + : : : : +- Project (113) + : : : : +- Filter (112) + : : : : +- Scan parquet (111) + : : : +- Sort (123) + : : : +- Exchange (122) + : : : +- Filter (121) + : : : +- Scan parquet (120) + : : +- Sort (131) + : : +- Exchange (130) + : : +- Filter (129) + : : +- Scan parquet (128) + : +- Sort (139) + : +- Exchange (138) + : +- Filter (137) + : +- Scan parquet (136) + +- Sort (148) + +- Exchange (147) + +- Project (146) + +- Filter (145) + +- Scan parquet (144) (1) Scan parquet @@ -143,562 +153,650 @@ Input [2]: [c_custkey#X, c_nationkey#X] Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false 
-(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [3]: [hash(o_custkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] +Arguments: X + +(15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] +Arguments: X + +(23) ColumnarExchange Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [2]: [c_nationkey#X, o_orderkey#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [2]: [c_nationkey#X, o_orderkey#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [2]: [c_nationkey#X, o_orderkey#X] -(24) Scan parquet +(27) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: 
struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [5]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(32) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [5]: [hash(l_suppkey#X, c_nationkey#X, 42) AS hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(40) ColumnarExchange Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, s_nationkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(42) WholeStageCodegenTransformer (X) +(47) 
WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join type: Inner Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [4]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(57) ColumnarExchange Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] -(54) Scan parquet +(61) Scan parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(55) FilterExecTransformer +(62) FilterExecTransformer Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [4]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] +Arguments: X + +(66) ColumnarExchange Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [3]: 
[n_nationkey#X, n_name#X, n_regionkey#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [5]: [hash(n_regionkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] +Arguments: X + +(74) ColumnarExchange Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(70) FilterExecTransformer +(79) FilterExecTransformer Input [2]: [r_regionkey#X, r_name#X] Arguments: ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [2]: [hash(r_regionkey#X, 42) AS hash_partition_key#X, r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [2]: [hash_partition_key#X, r_regionkey#X] +Arguments: X + +(83) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [1]: [r_regionkey#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [1]: [r_regionkey#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [1]: [r_regionkey#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join type: Inner Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, (l_extendedprice#X * (1 - l_discount#X)) AS _pre_X#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(79) FlushableHashAggregateExecTransformer +(89) FlushableHashAggregateExecTransformer Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, _pre_X#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum(_pre_X#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(80) ProjectExecTransformer +(90) ProjectExecTransformer Output [4]: [hash(n_name#X, 42) AS 
hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Input [3]: [n_name#X, sum#X, isEmpty#X] -(81) WholeStageCodegenTransformer (X) +(91) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: false -(82) ColumnarExchange +(92) VeloxAppendBatches +Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] +Arguments: X + +(93) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(83) ShuffleQueryStage +(94) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] Arguments: X -(84) InputAdapter +(95) InputAdapter Input [3]: [n_name#X, sum#X, isEmpty#X] -(85) InputIteratorTransformer +(96) InputIteratorTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] -(86) RegularHashAggregateExecTransformer +(97) RegularHashAggregateExecTransformer Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [n_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [2]: [n_name#X, revenue#X] +Arguments: X + +(100) ColumnarExchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [2]: [n_name#X, revenue#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [2]: [n_name#X, revenue#X] -(92) SortExecTransformer +(104) SortExecTransformer Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(93) WholeStageCodegenTransformer (X) +(105) WholeStageCodegenTransformer (X) Input [2]: [n_name#X, revenue#X] Arguments: false -(94) VeloxColumnarToRowExec +(106) VeloxColumnarToRowExec Input [2]: [n_name#X, revenue#X] -(95) Scan parquet +(107) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(96) Filter +(108) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(97) Exchange +(109) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(98) Scan parquet +(110) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(111) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1994-01-01), LessThan(o_orderdate,1995-01-01), IsNotNull(o_custkey), IsNotNull(o_orderkey)] ReadSchema: struct -(99) Filter +(112) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1994-01-01)) AND (o_orderdate#X < 1995-01-01)) AND isnotnull(o_custkey#X)) AND isnotnull(o_orderkey#X)) -(100) Project +(113) Project Output [2]: [o_orderkey#X, o_custkey#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(101) Exchange +(114) Exchange 
Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) ShuffledHashJoin +(115) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(116) SortMergeJoin Left keys [1]: [c_custkey#X] Right keys [1]: [o_custkey#X] Join type: Inner Join condition: None -(103) Project +(117) Project Output [2]: [c_nationkey#X, o_orderkey#X] Input [4]: [c_custkey#X, c_nationkey#X, o_orderkey#X, o_custkey#X] -(104) Exchange +(118) Exchange Input [2]: [c_nationkey#X, o_orderkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) Scan parquet +(119) Sort +Input [2]: [c_nationkey#X, o_orderkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(120) Scan parquet Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_orderkey), IsNotNull(l_suppkey)] ReadSchema: struct -(106) Filter +(121) Filter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : (isnotnull(l_orderkey#X) AND isnotnull(l_suppkey#X)) -(107) Exchange +(122) Exchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) ShuffledHashJoin +(123) Sort +Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(124) SortMergeJoin Left keys [1]: [o_orderkey#X] Right keys [1]: [l_orderkey#X] Join type: Inner Join condition: None -(109) Project +(125) Project Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [c_nationkey#X, o_orderkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(110) Exchange +(126) Exchange Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(111) Scan parquet +(127) Sort +Input [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST, c_nationkey#X ASC NULLS FIRST], false, 0 + +(128) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(112) Filter +(129) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(113) Exchange +(130) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(114) ShuffledHashJoin +(131) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST, s_nationkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [2]: [l_suppkey#X, c_nationkey#X] Right keys [2]: [s_suppkey#X, s_nationkey#X] Join type: Inner Join condition: None -(115) Project +(133) Project Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(116) Exchange +(134) Exchange Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) Scan parquet +(135) Sort +Input [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(136) Scan 
parquet Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(118) Filter +(137) Filter Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(119) Exchange +(138) Exchange Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(120) ShuffledHashJoin +(139) Sort +Input [3]: [n_nationkey#X, n_name#X, n_regionkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(140) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(121) Project +(141) Project Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Input [6]: [l_extendedprice#X, l_discount#X, s_nationkey#X, n_nationkey#X, n_name#X, n_regionkey#X] -(122) Exchange +(142) Exchange Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(123) Scan parquet +(143) Sort +Input [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] +Arguments: [n_regionkey#X ASC NULLS FIRST], false, 0 + +(144) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,ASIA), IsNotNull(r_regionkey)] ReadSchema: struct -(124) Filter +(145) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = ASIA)) AND isnotnull(r_regionkey#X)) -(125) Project +(146) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(126) Exchange +(147) Exchange Input [1]: [r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(127) ShuffledHashJoin +(148) Sort +Input [1]: [r_regionkey#X] +Arguments: [r_regionkey#X ASC NULLS FIRST], false, 0 + +(149) SortMergeJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join type: Inner Join condition: None -(128) Project +(150) Project Output [3]: [l_extendedprice#X, l_discount#X, n_name#X] Input [5]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X, r_regionkey#X] -(129) HashAggregate +(151) HashAggregate Input [3]: [l_extendedprice#X, l_discount#X, n_name#X] Keys [1]: [n_name#X] Functions [1]: [partial_sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [3]: [n_name#X, sum#X, isEmpty#X] -(130) Exchange +(152) Exchange Input [3]: [n_name#X, sum#X, isEmpty#X] Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(131) HashAggregate +(153) HashAggregate Input [3]: [n_name#X, sum#X, isEmpty#X] Keys [1]: [n_name#X] Functions [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))] Aggregate Attributes [1]: [sum((l_extendedprice#X * (1 - l_discount#X)))#X] Results [2]: [n_name#X, sum((l_extendedprice#X * (1 - l_discount#X)))#X AS revenue#X] -(132) Exchange +(154) Exchange Input [2]: [n_name#X, revenue#X] Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(133) Sort +(155) Sort Input [2]: [n_name#X, revenue#X] Arguments: [revenue#X DESC NULLS LAST], true, 0 -(134) AdaptiveSparkPlan +(156) AdaptiveSparkPlan Output [2]: [n_name#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/6.txt 
b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/6.txt index 0882ff9e151c6..ddc921e22d0f6 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/6.txt @@ -1,23 +1,23 @@ == Physical Plan == -AdaptiveSparkPlan (19) +AdaptiveSparkPlan (20) +- == Final Plan == - VeloxColumnarToRowExec (12) - +- ^ RegularHashAggregateExecTransformer (10) - +- ^ InputIteratorTransformer (9) - +- ^ InputAdapter (8) - +- ^ ShuffleQueryStage (7), Statistics(X) - +- ColumnarExchange (6) + VeloxColumnarToRowExec (13) + +- ^ RegularHashAggregateExecTransformer (11) + +- ^ InputIteratorTransformer (10) + +- ShuffleQueryStage (8), Statistics(X) + +- ColumnarExchange (7) + +- VeloxAppendBatches (6) +- ^ FlushableHashAggregateExecTransformer (4) +- ^ ProjectExecTransformer (3) +- ^ FilterExecTransformer (2) +- ^ Scan parquet (1) +- == Initial Plan == - HashAggregate (18) - +- Exchange (17) - +- HashAggregate (16) - +- Project (15) - +- Filter (14) - +- Scan parquet (13) + HashAggregate (19) + +- Exchange (18) + +- HashAggregate (17) + +- Project (16) + +- Filter (15) + +- Scan parquet (14) (1) Scan parquet @@ -46,67 +46,71 @@ Results [2]: [sum#X, isEmpty#X] Input [2]: [sum#X, isEmpty#X] Arguments: false -(6) ColumnarExchange +(6) VeloxAppendBatches +Input [2]: [sum#X, isEmpty#X] +Arguments: X + +(7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(7) ShuffleQueryStage +(8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] Arguments: X -(8) InputAdapter +(9) InputAdapter Input [2]: [sum#X, isEmpty#X] -(9) InputIteratorTransformer +(10) InputIteratorTransformer Input [2]: [sum#X, isEmpty#X] -(10) RegularHashAggregateExecTransformer +(11) RegularHashAggregateExecTransformer Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * l_discount#X))] Aggregate Attributes [1]: [sum((l_extendedprice#X * l_discount#X))#X] Results [1]: [sum((l_extendedprice#X * l_discount#X))#X AS revenue#X] -(11) WholeStageCodegenTransformer (X) +(12) WholeStageCodegenTransformer (X) Input [1]: [revenue#X] Arguments: false -(12) VeloxColumnarToRowExec +(13) VeloxColumnarToRowExec Input [1]: [revenue#X] -(13) Scan parquet +(14) Scan parquet Output [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), IsNotNull(l_discount), IsNotNull(l_quantity), GreaterThanOrEqual(l_shipdate,1994-01-01), LessThan(l_shipdate,1995-01-01), GreaterThanOrEqual(l_discount,0.05), LessThanOrEqual(l_discount,0.07), LessThan(l_quantity,24.00)] ReadSchema: struct -(14) Filter +(15) Filter Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : (((((((isnotnull(l_shipdate#X) AND isnotnull(l_discount#X)) AND isnotnull(l_quantity#X)) AND (l_shipdate#X >= 1994-01-01)) AND (l_shipdate#X < 1995-01-01)) AND (l_discount#X >= 0.05)) AND (l_discount#X <= 0.07)) AND (l_quantity#X < 24.00)) -(15) Project +(16) Project Output [2]: [l_extendedprice#X, l_discount#X] Input [4]: [l_quantity#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) HashAggregate +(17) HashAggregate Input [2]: [l_extendedprice#X, l_discount#X] Keys: [] Functions [1]: [partial_sum((l_extendedprice#X * l_discount#X))] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [2]: [sum#X, isEmpty#X] -(17) Exchange +(18) Exchange Input [2]: [sum#X, isEmpty#X] Arguments: SinglePartition, 
ENSURE_REQUIREMENTS, [plan_id=X] -(18) HashAggregate +(19) HashAggregate Input [2]: [sum#X, isEmpty#X] Keys: [] Functions [1]: [sum((l_extendedprice#X * l_discount#X))] Aggregate Attributes [1]: [sum((l_extendedprice#X * l_discount#X))#X] Results [1]: [sum((l_extendedprice#X * l_discount#X))#X AS revenue#X] -(19) AdaptiveSparkPlan +(20) AdaptiveSparkPlan Output [1]: [revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/7.txt index 29c3f048b7f8c..b27398e415d31 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/7.txt @@ -1,122 +1,131 @@ == Physical Plan == -AdaptiveSparkPlan (128) +AdaptiveSparkPlan (149) +- == Final Plan == - VeloxColumnarToRowExec (90) - +- ^ SortExecTransformer (88) - +- ^ InputIteratorTransformer (87) - +- ^ InputAdapter (86) - +- ^ ShuffleQueryStage (85), Statistics(X) - +- ColumnarExchange (84) - +- ^ RegularHashAggregateExecTransformer (82) - +- ^ InputIteratorTransformer (81) - +- ^ InputAdapter (80) - +- ^ ShuffleQueryStage (79), Statistics(X) - +- ColumnarExchange (78) - +- ^ ProjectExecTransformer (76) - +- ^ FlushableHashAggregateExecTransformer (75) - +- ^ ProjectExecTransformer (74) - +- ^ ShuffledHashJoinExecTransformer Inner (73) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer (18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (101) + +- ^ SortExecTransformer (99) + +- ^ InputIteratorTransformer (98) + +- ShuffleQueryStage (96), Statistics(X) + +- ColumnarExchange (95) + +- VeloxAppendBatches (94) + +- ^ RegularHashAggregateExecTransformer (92) + +- ^ InputIteratorTransformer (91) + +- ShuffleQueryStage (89), Statistics(X) + +- ColumnarExchange (88) + +- VeloxAppendBatches (87) + +- ^ ProjectExecTransformer (85) + +- ^ FlushableHashAggregateExecTransformer (84) + +- ^ ProjectExecTransformer (83) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (82) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75), Statistics(X) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58), Statistics(X) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches 
(56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41), Statistics(X) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ FilterExecTransformer (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ FilterExecTransformer (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ FilterExecTransformer (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ FilterExecTransformer (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ FilterExecTransformer (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (72) - +- ^ InputAdapter (71) - +- ^ ShuffleQueryStage (70), Statistics(X) - +- ReusedExchange (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ FilterExecTransformer (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33), Statistics(X) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ FilterExecTransformer (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ FilterExecTransformer (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ FilterExecTransformer (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (81) + +- ShuffleQueryStage (79), Statistics(X) + +- ReusedExchange (78) +- == Initial Plan == - Sort (127) - +- Exchange (126) - +- HashAggregate (125) - +- Exchange (124) - +- HashAggregate (123) - +- Project (122) - +- ShuffledHashJoin Inner BuildRight (121) - :- Exchange (117) - : +- Project (116) - : +- ShuffledHashJoin Inner 
BuildRight (115) - : :- Exchange (111) - : : +- Project (110) - : : +- ShuffledHashJoin Inner BuildRight (109) - : : :- Exchange (105) - : : : +- Project (104) - : : : +- ShuffledHashJoin Inner BuildRight (103) - : : : :- Exchange (99) - : : : : +- Project (98) - : : : : +- ShuffledHashJoin Inner BuildLeft (97) - : : : : :- Exchange (93) - : : : : : +- Filter (92) - : : : : : +- Scan parquet (91) - : : : : +- Exchange (96) - : : : : +- Filter (95) - : : : : +- Scan parquet (94) - : : : +- Exchange (102) - : : : +- Filter (101) - : : : +- Scan parquet (100) - : : +- Exchange (108) - : : +- Filter (107) - : : +- Scan parquet (106) - : +- Exchange (114) - : +- Filter (113) - : +- Scan parquet (112) - +- Exchange (120) - +- Filter (119) - +- Scan parquet (118) + Sort (148) + +- Exchange (147) + +- HashAggregate (146) + +- Exchange (145) + +- HashAggregate (144) + +- Project (143) + +- SortMergeJoin Inner (142) + :- Sort (137) + : +- Exchange (136) + : +- Project (135) + : +- SortMergeJoin Inner (134) + : :- Sort (129) + : : +- Exchange (128) + : : +- Project (127) + : : +- SortMergeJoin Inner (126) + : : :- Sort (121) + : : : +- Exchange (120) + : : : +- Project (119) + : : : +- SortMergeJoin Inner (118) + : : : :- Sort (113) + : : : : +- Exchange (112) + : : : : +- Project (111) + : : : : +- SortMergeJoin Inner (110) + : : : : :- Sort (105) + : : : : : +- Exchange (104) + : : : : : +- Filter (103) + : : : : : +- Scan parquet (102) + : : : : +- Sort (109) + : : : : +- Exchange (108) + : : : : +- Filter (107) + : : : : +- Scan parquet (106) + : : : +- Sort (117) + : : : +- Exchange (116) + : : : +- Filter (115) + : : : +- Scan parquet (114) + : : +- Sort (125) + : : +- Exchange (124) + : : +- Filter (123) + : : +- Scan parquet (122) + : +- Sort (133) + : +- Exchange (132) + : +- Filter (131) + : +- Scan parquet (130) + +- Sort (141) + +- Exchange (140) + +- Filter (139) + +- Scan parquet (138) (1) Scan parquet @@ -138,534 +147,618 @@ Input [2]: [s_suppkey#X, s_nationkey#X] Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(6) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [6]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [5]: 
[l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: X + +(15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [6]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: X + +(23) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [2]: [o_orderkey#X, o_custkey#X] Arguments: (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X] Input [2]: [o_orderkey#X, o_custkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, 
o_custkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [o_orderkey#X, o_custkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [o_orderkey#X, o_custkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [6]: [hash(o_custkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] +Arguments: X + +(40) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -(39) Scan parquet +(44) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [2]: [c_custkey#X, c_nationkey#X] Arguments: (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [3]: [hash(c_custkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_nationkey#X] Input [2]: [c_custkey#X, c_nationkey#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(49) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join 
type: Inner Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [6]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] +Arguments: X + +(57) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(55) FilterExecTransformer +(62) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [6]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [6]: 
[hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] +Arguments: X + +(74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -(69) ReusedExchange [Reuses operator id: 58] +(78) ReusedExchange [Reuses operator id: 66] Output [2]: [n_nationkey#X, n_name#X] -(70) ShuffleQueryStage +(79) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(71) InputAdapter +(80) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(72) InputIteratorTransformer +(81) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(73) ShuffledHashJoinExecTransformer +(82) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(74) ProjectExecTransformer +(83) ProjectExecTransformer Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, (l_extendedprice#X * (1 - l_discount#X)) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(75) FlushableHashAggregateExecTransformer +(84) FlushableHashAggregateExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(76) ProjectExecTransformer +(85) ProjectExecTransformer Output [6]: [hash(supp_nation#X, cust_nation#X, l_year#X, 42) AS hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(77) WholeStageCodegenTransformer (X) +(86) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: false -(78) ColumnarExchange +(87) VeloxAppendBatches +Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] +Arguments: X + +(88) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(79) ShuffleQueryStage +(89) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: X -(80) InputAdapter +(90) InputAdapter Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(81) InputIteratorTransformer +(91) InputIteratorTransformer Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(82) RegularHashAggregateExecTransformer +(92) RegularHashAggregateExecTransformer Input [5]: 
[supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(83) WholeStageCodegenTransformer (X) +(93) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(84) ColumnarExchange +(94) VeloxAppendBatches +Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] +Arguments: X + +(95) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(85) ShuffleQueryStage +(96) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: X -(86) InputAdapter +(97) InputAdapter Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(87) InputIteratorTransformer +(98) InputIteratorTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(88) SortExecTransformer +(99) SortExecTransformer Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(89) WholeStageCodegenTransformer (X) +(100) WholeStageCodegenTransformer (X) Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: false -(90) VeloxColumnarToRowExec +(101) VeloxColumnarToRowExec Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -(91) Scan parquet +(102) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(92) Filter +(103) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(93) Exchange +(104) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(94) Scan parquet +(105) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(106) Scan parquet Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_shipdate), GreaterThanOrEqual(l_shipdate,1995-01-01), LessThanOrEqual(l_shipdate,1996-12-31), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(95) Filter +(107) Filter Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Condition : ((((isnotnull(l_shipdate#X) AND (l_shipdate#X >= 1995-01-01)) AND (l_shipdate#X <= 1996-12-31)) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(96) Exchange +(108) Exchange Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(97) ShuffledHashJoin +(109) Sort +Input [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(110) SortMergeJoin Left keys [1]: [s_suppkey#X] Right keys [1]: [l_suppkey#X] Join type: Inner Join condition: None -(98) Project +(111) Project Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Input [7]: [s_suppkey#X, s_nationkey#X, l_orderkey#X, 
l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -(99) Exchange +(112) Exchange Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(100) Scan parquet +(113) Sort +Input [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(114) Scan parquet Output [2]: [o_orderkey#X, o_custkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(101) Filter +(115) Filter Input [2]: [o_orderkey#X, o_custkey#X] Condition : (isnotnull(o_orderkey#X) AND isnotnull(o_custkey#X)) -(102) Exchange +(116) Exchange Input [2]: [o_orderkey#X, o_custkey#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(103) ShuffledHashJoin +(117) Sort +Input [2]: [o_orderkey#X, o_custkey#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(118) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(104) Project +(119) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Input [7]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_orderkey#X, o_custkey#X] -(105) Exchange +(120) Exchange Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(106) Scan parquet +(121) Sort +Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(122) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(107) Filter +(123) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(108) Exchange +(124) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(109) ShuffledHashJoin +(125) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(126) SortMergeJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join type: Inner Join condition: None -(110) Project +(127) Project Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X, c_custkey#X, c_nationkey#X] -(111) Exchange +(128) Exchange Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(112) Scan parquet +(129) Sort +Input [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(130) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,FRANCE),EqualTo(n_name,GERMANY))] ReadSchema: struct -(113) Filter +(131) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = FRANCE) OR (n_name#X = GERMANY))) -(114) Exchange +(132) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: 
hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(115) ShuffledHashJoin +(133) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(134) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(116) Project +(135) Project Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Input [7]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_nationkey#X, n_name#X] -(117) Exchange +(136) Exchange Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(118) Scan parquet +(137) Sort +Input [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(138) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), Or(EqualTo(n_name,GERMANY),EqualTo(n_name,FRANCE))] ReadSchema: struct -(119) Filter +(139) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : (isnotnull(n_nationkey#X) AND ((n_name#X = GERMANY) OR (n_name#X = FRANCE))) -(120) Exchange +(140) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(121) ShuffledHashJoin +(141) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(142) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: (((n_name#X = FRANCE) AND (n_name#X = GERMANY)) OR ((n_name#X = GERMANY) AND (n_name#X = FRANCE))) -(122) Project +(143) Project Output [4]: [n_name#X AS supp_nation#X, n_name#X AS cust_nation#X, year(l_shipdate#X) AS l_year#X, (l_extendedprice#X * (1 - l_discount#X)) AS volume#X] Input [7]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X, n_nationkey#X, n_name#X] -(123) HashAggregate +(144) HashAggregate Input [4]: [supp_nation#X, cust_nation#X, l_year#X, volume#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [partial_sum(volume#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -(124) Exchange +(145) Exchange Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(125) HashAggregate +(146) HashAggregate Input [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] Keys [3]: [supp_nation#X, cust_nation#X, l_year#X] Functions [1]: [sum(volume#X)] Aggregate Attributes [1]: [sum(volume#X)#X] Results [4]: [supp_nation#X, cust_nation#X, l_year#X, sum(volume#X)#X AS revenue#X] -(126) Exchange +(147) Exchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(127) Sort +(148) Sort Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: [supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST], true, 0 -(128) AdaptiveSparkPlan +(149) AdaptiveSparkPlan Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git 
a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/8.txt index 9c24e9ec5a01d..fa2a2789f4d37 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/8.txt @@ -1,166 +1,180 @@ == Physical Plan == -AdaptiveSparkPlan (177) +AdaptiveSparkPlan (207) +- == Final Plan == - VeloxColumnarToRowExec (125) - +- ^ SortExecTransformer (123) - +- ^ InputIteratorTransformer (122) - +- ^ InputAdapter (121) - +- ^ ShuffleQueryStage (120), Statistics(X) - +- ColumnarExchange (119) - +- ^ ProjectExecTransformer (117) - +- ^ RegularHashAggregateExecTransformer (116) - +- ^ InputIteratorTransformer (115) - +- ^ InputAdapter (114) - +- ^ ShuffleQueryStage (113), Statistics(X) - +- ColumnarExchange (112) - +- ^ ProjectExecTransformer (110) - +- ^ FlushableHashAggregateExecTransformer (109) - +- ^ ProjectExecTransformer (108) - +- ^ ShuffledHashJoinExecTransformer Inner (107) - :- ^ InputIteratorTransformer (98) - : +- ^ InputAdapter (97) - : +- ^ ShuffleQueryStage (96), Statistics(X) - : +- ColumnarExchange (95) - : +- ^ ProjectExecTransformer (93) - : +- ^ ShuffledHashJoinExecTransformer Inner (92) - : :- ^ InputIteratorTransformer (83) - : : +- ^ InputAdapter (82) - : : +- ^ ShuffleQueryStage (81), Statistics(X) - : : +- ColumnarExchange (80) - : : +- ^ ProjectExecTransformer (78) - : : +- ^ ShuffledHashJoinExecTransformer Inner (77) - : : :- ^ InputIteratorTransformer (68) - : : : +- ^ InputAdapter (67) - : : : +- ^ ShuffleQueryStage (66), Statistics(X) - : : : +- ColumnarExchange (65) - : : : +- ^ ProjectExecTransformer (63) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : : : :- ^ InputIteratorTransformer (53) - : : : : +- ^ InputAdapter (52) - : : : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : : : +- ColumnarExchange (50) - : : : : +- ^ ProjectExecTransformer (48) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : : : :- ^ InputIteratorTransformer (38) - : : : : : +- ^ InputAdapter (37) - : : : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : : : +- ColumnarExchange (35) - : : : : : +- ^ ProjectExecTransformer (33) - : : : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : : : :- ^ InputIteratorTransformer (23) - : : : : : : +- ^ InputAdapter (22) - : : : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : : : +- ColumnarExchange (20) - : : : : : : +- ^ ProjectExecTransformer (18) - : : : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : : : :- ^ InputIteratorTransformer (8) - : : : : : : : +- ^ InputAdapter (7) - : : : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (141) + +- ^ SortExecTransformer (139) + +- ^ InputIteratorTransformer (138) + +- ShuffleQueryStage (136), Statistics(X) + +- ColumnarExchange (135) + +- VeloxAppendBatches (134) + +- ^ ProjectExecTransformer (132) + +- ^ RegularHashAggregateExecTransformer (131) + +- ^ InputIteratorTransformer (130) + +- ShuffleQueryStage (128), Statistics(X) + +- ColumnarExchange (127) + +- VeloxAppendBatches (126) + +- ^ ProjectExecTransformer (124) + +- ^ FlushableHashAggregateExecTransformer (123) + +- ^ ProjectExecTransformer (122) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (121) + :- ^ InputIteratorTransformer (111) + : +- ShuffleQueryStage (109), Statistics(X) + : +- ColumnarExchange (108) + : +- VeloxAppendBatches (107) + : +- ^ 
ProjectExecTransformer (105) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (104) + : :- ^ InputIteratorTransformer (94) + : : +- ShuffleQueryStage (92), Statistics(X) + : : +- ColumnarExchange (91) + : : +- VeloxAppendBatches (90) + : : +- ^ ProjectExecTransformer (88) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + : : :- ^ InputIteratorTransformer (77) + : : : +- ShuffleQueryStage (75), Statistics(X) + : : : +- ColumnarExchange (74) + : : : +- VeloxAppendBatches (73) + : : : +- ^ ProjectExecTransformer (71) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (70) + : : : :- ^ InputIteratorTransformer (60) + : : : : +- ShuffleQueryStage (58), Statistics(X) + : : : : +- ColumnarExchange (57) + : : : : +- VeloxAppendBatches (56) + : : : : +- ^ ProjectExecTransformer (54) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (53) + : : : : :- ^ InputIteratorTransformer (43) + : : : : : +- ShuffleQueryStage (41), Statistics(X) + : : : : : +- ColumnarExchange (40) + : : : : : +- VeloxAppendBatches (39) + : : : : : +- ^ ProjectExecTransformer (37) + : : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : : : :- ^ InputIteratorTransformer (26) + : : : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : : : +- ColumnarExchange (23) + : : : : : : +- VeloxAppendBatches (22) + : : : : : : +- ^ ProjectExecTransformer (20) + : : : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : : : :- ^ InputIteratorTransformer (9) + : : : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : : : +- ColumnarExchange (6) + : : : : : : : +- VeloxAppendBatches (5) : : : : : : : +- ^ ProjectExecTransformer (3) : : : : : : : +- ^ FilterExecTransformer (2) : : : : : : : +- ^ Scan parquet (1) - : : : : : : +- ^ InputIteratorTransformer (16) - : : : : : : +- ^ InputAdapter (15) - : : : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : : : +- ColumnarExchange (13) - : : : : : : +- ^ ProjectExecTransformer (11) - : : : : : : +- ^ FilterExecTransformer (10) - : : : : : : +- ^ Scan parquet (9) - : : : : : +- ^ InputIteratorTransformer (31) - : : : : : +- ^ InputAdapter (30) - : : : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : : : +- ColumnarExchange (28) - : : : : : +- ^ ProjectExecTransformer (26) - : : : : : +- ^ FilterExecTransformer (25) - : : : : : +- ^ Scan parquet (24) - : : : : +- ^ InputIteratorTransformer (46) - : : : : +- ^ InputAdapter (45) - : : : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : : : +- ColumnarExchange (43) - : : : : +- ^ ProjectExecTransformer (41) - : : : : +- ^ FilterExecTransformer (40) - : : : : +- ^ Scan parquet (39) - : : : +- ^ InputIteratorTransformer (61) - : : : +- ^ InputAdapter (60) - : : : +- ^ ShuffleQueryStage (59), Statistics(X) - : : : +- ColumnarExchange (58) - : : : +- ^ ProjectExecTransformer (56) - : : : +- ^ FilterExecTransformer (55) - : : : +- ^ Scan parquet (54) - : : +- ^ InputIteratorTransformer (76) - : : +- ^ InputAdapter (75) - : : +- ^ ShuffleQueryStage (74), Statistics(X) - : : +- ColumnarExchange (73) - : : +- ^ ProjectExecTransformer (71) - : : +- ^ FilterExecTransformer (70) - : : +- ^ Scan parquet (69) - : +- ^ InputIteratorTransformer (91) - : +- ^ InputAdapter (90) - : +- ^ ShuffleQueryStage (89), Statistics(X) - : +- ColumnarExchange (88) - : +- ^ ProjectExecTransformer (86) - : +- ^ FilterExecTransformer (85) - : +- ^ Scan parquet (84) - +- ^ InputIteratorTransformer (106) - +- ^ InputAdapter (105) - +- ^ ShuffleQueryStage (104), 
Statistics(X) - +- ColumnarExchange (103) - +- ^ ProjectExecTransformer (101) - +- ^ FilterExecTransformer (100) - +- ^ Scan parquet (99) + : : : : : : +- ^ InputIteratorTransformer (18) + : : : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : : : +- ColumnarExchange (15) + : : : : : : +- VeloxAppendBatches (14) + : : : : : : +- ^ ProjectExecTransformer (12) + : : : : : : +- ^ FilterExecTransformer (11) + : : : : : : +- ^ Scan parquet (10) + : : : : : +- ^ InputIteratorTransformer (35) + : : : : : +- ShuffleQueryStage (33), Statistics(X) + : : : : : +- ColumnarExchange (32) + : : : : : +- VeloxAppendBatches (31) + : : : : : +- ^ ProjectExecTransformer (29) + : : : : : +- ^ FilterExecTransformer (28) + : : : : : +- ^ Scan parquet (27) + : : : : +- ^ InputIteratorTransformer (52) + : : : : +- ShuffleQueryStage (50), Statistics(X) + : : : : +- ColumnarExchange (49) + : : : : +- VeloxAppendBatches (48) + : : : : +- ^ ProjectExecTransformer (46) + : : : : +- ^ FilterExecTransformer (45) + : : : : +- ^ Scan parquet (44) + : : : +- ^ InputIteratorTransformer (69) + : : : +- ShuffleQueryStage (67), Statistics(X) + : : : +- ColumnarExchange (66) + : : : +- VeloxAppendBatches (65) + : : : +- ^ ProjectExecTransformer (63) + : : : +- ^ FilterExecTransformer (62) + : : : +- ^ Scan parquet (61) + : : +- ^ InputIteratorTransformer (86) + : : +- ShuffleQueryStage (84), Statistics(X) + : : +- ColumnarExchange (83) + : : +- VeloxAppendBatches (82) + : : +- ^ ProjectExecTransformer (80) + : : +- ^ FilterExecTransformer (79) + : : +- ^ Scan parquet (78) + : +- ^ InputIteratorTransformer (103) + : +- ShuffleQueryStage (101), Statistics(X) + : +- ColumnarExchange (100) + : +- VeloxAppendBatches (99) + : +- ^ ProjectExecTransformer (97) + : +- ^ FilterExecTransformer (96) + : +- ^ Scan parquet (95) + +- ^ InputIteratorTransformer (120) + +- ShuffleQueryStage (118), Statistics(X) + +- ColumnarExchange (117) + +- VeloxAppendBatches (116) + +- ^ ProjectExecTransformer (114) + +- ^ FilterExecTransformer (113) + +- ^ Scan parquet (112) +- == Initial Plan == - Sort (176) - +- Exchange (175) - +- HashAggregate (174) - +- Exchange (173) - +- HashAggregate (172) - +- Project (171) - +- ShuffledHashJoin Inner BuildRight (170) - :- Exchange (165) - : +- Project (164) - : +- ShuffledHashJoin Inner BuildRight (163) - : :- Exchange (159) - : : +- Project (158) - : : +- ShuffledHashJoin Inner BuildRight (157) - : : :- Exchange (153) - : : : +- Project (152) - : : : +- ShuffledHashJoin Inner BuildRight (151) - : : : :- Exchange (147) - : : : : +- Project (146) - : : : : +- ShuffledHashJoin Inner BuildRight (145) - : : : : :- Exchange (141) - : : : : : +- Project (140) - : : : : : +- ShuffledHashJoin Inner BuildRight (139) - : : : : : :- Exchange (135) - : : : : : : +- Project (134) - : : : : : : +- ShuffledHashJoin Inner BuildLeft (133) - : : : : : : :- Exchange (129) - : : : : : : : +- Project (128) - : : : : : : : +- Filter (127) - : : : : : : : +- Scan parquet (126) - : : : : : : +- Exchange (132) - : : : : : : +- Filter (131) - : : : : : : +- Scan parquet (130) - : : : : : +- Exchange (138) - : : : : : +- Filter (137) - : : : : : +- Scan parquet (136) - : : : : +- Exchange (144) - : : : : +- Filter (143) - : : : : +- Scan parquet (142) - : : : +- Exchange (150) - : : : +- Filter (149) - : : : +- Scan parquet (148) - : : +- Exchange (156) - : : +- Filter (155) - : : +- Scan parquet (154) - : +- Exchange (162) - : +- Filter (161) - : +- Scan parquet (160) - +- Exchange (169) - +- Project (168) - +- Filter (167) - +- 
Scan parquet (166) + Sort (206) + +- Exchange (205) + +- HashAggregate (204) + +- Exchange (203) + +- HashAggregate (202) + +- Project (201) + +- SortMergeJoin Inner (200) + :- Sort (194) + : +- Exchange (193) + : +- Project (192) + : +- SortMergeJoin Inner (191) + : :- Sort (186) + : : +- Exchange (185) + : : +- Project (184) + : : +- SortMergeJoin Inner (183) + : : :- Sort (178) + : : : +- Exchange (177) + : : : +- Project (176) + : : : +- SortMergeJoin Inner (175) + : : : :- Sort (170) + : : : : +- Exchange (169) + : : : : +- Project (168) + : : : : +- SortMergeJoin Inner (167) + : : : : :- Sort (162) + : : : : : +- Exchange (161) + : : : : : +- Project (160) + : : : : : +- SortMergeJoin Inner (159) + : : : : : :- Sort (154) + : : : : : : +- Exchange (153) + : : : : : : +- Project (152) + : : : : : : +- SortMergeJoin Inner (151) + : : : : : : :- Sort (146) + : : : : : : : +- Exchange (145) + : : : : : : : +- Project (144) + : : : : : : : +- Filter (143) + : : : : : : : +- Scan parquet (142) + : : : : : : +- Sort (150) + : : : : : : +- Exchange (149) + : : : : : : +- Filter (148) + : : : : : : +- Scan parquet (147) + : : : : : +- Sort (158) + : : : : : +- Exchange (157) + : : : : : +- Filter (156) + : : : : : +- Scan parquet (155) + : : : : +- Sort (166) + : : : : +- Exchange (165) + : : : : +- Filter (164) + : : : : +- Scan parquet (163) + : : : +- Sort (174) + : : : +- Exchange (173) + : : : +- Filter (172) + : : : +- Scan parquet (171) + : : +- Sort (182) + : : +- Exchange (181) + : : +- Filter (180) + : : +- Scan parquet (179) + : +- Sort (190) + : +- Exchange (189) + : +- Filter (188) + : +- Scan parquet (187) + +- Sort (199) + +- Exchange (198) + +- Project (197) + +- Filter (196) + +- Scan parquet (195) (1) Scan parquet @@ -182,746 +196,866 @@ Input [2]: [p_partkey#X, p_type#X] Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(5) ColumnarExchange +(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [p_partkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [p_partkey#X] -(9) Scan parquet +(10) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [6]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(15) 
ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [5]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(23) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys 
[1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [5]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(40) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(39) Scan parquet +(44) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [6]: [hash(o_custkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(49) 
WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] +Arguments: X + +(57) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(55) FilterExecTransformer +(62) FilterExecTransformer Input [2]: [c_custkey#X, c_nationkey#X] Arguments: (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(c_custkey#X, 42) AS hash_partition_key#X, c_custkey#X, c_nationkey#X] Input [2]: [c_custkey#X, c_nationkey#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [c_custkey#X, c_nationkey#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [c_custkey#X, c_nationkey#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join type: Inner Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [6]: [hash(c_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] +Arguments: X + +(74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, 
o_orderdate#X, c_nationkey#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(70) FilterExecTransformer +(79) FilterExecTransformer Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_regionkey#X] Input [2]: [n_nationkey#X, n_regionkey#X] -(72) WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] +Arguments: X + +(83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_regionkey#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [2]: [n_nationkey#X, n_regionkey#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [2]: [n_nationkey#X, n_regionkey#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [6]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(79) WholeStageCodegenTransformer (X) +(89) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: false -(80) ColumnarExchange +(90) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] +Arguments: X + +(91) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [id=#X] -(81) ShuffleQueryStage +(92) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: X -(82) InputAdapter +(93) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -(83) InputIteratorTransformer +(94) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -(84) Scan parquet +(95) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: 
InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(85) FilterExecTransformer +(96) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: isnotnull(n_nationkey#X) -(86) ProjectExecTransformer +(97) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(100) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(92) ShuffledHashJoinExecTransformer +(104) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(93) ProjectExecTransformer +(105) ProjectExecTransformer Output [6]: [hash(n_regionkey#X, 42) AS hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(94) WholeStageCodegenTransformer (X) +(106) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: false -(95) ColumnarExchange +(107) VeloxAppendBatches +Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] +Arguments: X + +(108) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [id=#X] -(96) ShuffleQueryStage +(109) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: X -(97) InputAdapter +(110) InputAdapter Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -(98) InputIteratorTransformer +(111) InputIteratorTransformer Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -(99) Scan parquet +(112) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(100) FilterExecTransformer +(113) FilterExecTransformer Input [2]: [r_regionkey#X, r_name#X] Arguments: ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(101) ProjectExecTransformer +(114) ProjectExecTransformer Output [2]: [hash(r_regionkey#X, 42) AS hash_partition_key#X, r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(102) WholeStageCodegenTransformer (X) +(115) WholeStageCodegenTransformer (X) Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: false -(103) ColumnarExchange +(116) VeloxAppendBatches +Input [2]: [hash_partition_key#X, r_regionkey#X] 
+Arguments: X + +(117) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] -(104) ShuffleQueryStage +(118) ShuffleQueryStage Output [1]: [r_regionkey#X] Arguments: X -(105) InputAdapter +(119) InputAdapter Input [1]: [r_regionkey#X] -(106) InputIteratorTransformer +(120) InputIteratorTransformer Input [1]: [r_regionkey#X] -(107) ShuffledHashJoinExecTransformer +(121) ShuffledHashJoinExecTransformer Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join type: Inner Join condition: None -(108) ProjectExecTransformer +(122) ProjectExecTransformer Output [4]: [year(o_orderdate#X) AS o_year#X, (l_extendedprice#X * (1 - l_discount#X)) AS volume#X, n_name#X AS nation#X, CASE WHEN (n_name#X = BRAZIL) THEN (l_extendedprice#X * (1 - l_discount#X)) ELSE 0.0000 END AS _pre_X#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(109) FlushableHashAggregateExecTransformer +(123) FlushableHashAggregateExecTransformer Input [4]: [o_year#X, volume#X, nation#X, _pre_X#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(_pre_X#X), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(110) ProjectExecTransformer +(124) ProjectExecTransformer Output [6]: [hash(o_year#X, 42) AS hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(111) WholeStageCodegenTransformer (X) +(125) WholeStageCodegenTransformer (X) Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: false -(112) ColumnarExchange +(126) VeloxAppendBatches +Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] +Arguments: X + +(127) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(113) ShuffleQueryStage +(128) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: X -(114) InputAdapter +(129) InputAdapter Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(115) InputIteratorTransformer +(130) InputIteratorTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(116) RegularHashAggregateExecTransformer +(131) RegularHashAggregateExecTransformer Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(117) ProjectExecTransformer +(132) ProjectExecTransformer Output [2]: [o_year#X, (sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X / sum(volume#X)#X) AS mkt_share#X] Input [3]: [o_year#X, sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] -(118) WholeStageCodegenTransformer (X) +(133) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(119) ColumnarExchange +(134) VeloxAppendBatches +Input [2]: [o_year#X, mkt_share#X] +Arguments: X + +(135) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] 
Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(120) ShuffleQueryStage +(136) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] Arguments: X -(121) InputAdapter +(137) InputAdapter Input [2]: [o_year#X, mkt_share#X] -(122) InputIteratorTransformer +(138) InputIteratorTransformer Input [2]: [o_year#X, mkt_share#X] -(123) SortExecTransformer +(139) SortExecTransformer Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(124) WholeStageCodegenTransformer (X) +(140) WholeStageCodegenTransformer (X) Input [2]: [o_year#X, mkt_share#X] Arguments: false -(125) VeloxColumnarToRowExec +(141) VeloxColumnarToRowExec Input [2]: [o_year#X, mkt_share#X] -(126) Scan parquet +(142) Scan parquet Output [2]: [p_partkey#X, p_type#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_type), EqualTo(p_type,ECONOMY ANODIZED STEEL), IsNotNull(p_partkey)] ReadSchema: struct -(127) Filter +(143) Filter Input [2]: [p_partkey#X, p_type#X] Condition : ((isnotnull(p_type#X) AND (p_type#X = ECONOMY ANODIZED STEEL)) AND isnotnull(p_partkey#X)) -(128) Project +(144) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_type#X] -(129) Exchange +(145) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(130) Scan parquet +(146) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(147) Scan parquet Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(131) Filter +(148) Filter Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(132) Exchange +(149) Exchange Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(133) ShuffledHashJoin +(150) Sort +Input [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(151) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: None -(134) Project +(152) Project Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Input [6]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -(135) Exchange +(153) Exchange Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(136) Scan parquet +(154) Sort +Input [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(155) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(137) Filter +(156) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(138) Exchange +(157) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(139) ShuffledHashJoin +(158) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS 
FIRST], false, 0 + +(159) SortMergeJoin Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(140) Project +(160) Project Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [6]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(141) Exchange +(161) Exchange Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(142) Scan parquet +(162) Sort +Input [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(163) Scan parquet Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderdate), GreaterThanOrEqual(o_orderdate,1995-01-01), LessThanOrEqual(o_orderdate,1996-12-31), IsNotNull(o_orderkey), IsNotNull(o_custkey)] ReadSchema: struct -(143) Filter +(164) Filter Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Condition : ((((isnotnull(o_orderdate#X) AND (o_orderdate#X >= 1995-01-01)) AND (o_orderdate#X <= 1996-12-31)) AND isnotnull(o_orderkey#X)) AND isnotnull(o_custkey#X)) -(144) Exchange +(165) Exchange Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(145) ShuffledHashJoin +(166) Sort +Input [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(167) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(146) Project +(168) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Input [7]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -(147) Exchange +(169) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(148) Scan parquet +(170) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] +Arguments: [o_custkey#X ASC NULLS FIRST], false, 0 + +(171) Scan parquet Output [2]: [c_custkey#X, c_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(c_custkey), IsNotNull(c_nationkey)] ReadSchema: struct -(149) Filter +(172) Filter Input [2]: [c_custkey#X, c_nationkey#X] Condition : (isnotnull(c_custkey#X) AND isnotnull(c_nationkey#X)) -(150) Exchange +(173) Exchange Input [2]: [c_custkey#X, c_nationkey#X] Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(151) ShuffledHashJoin +(174) Sort +Input [2]: [c_custkey#X, c_nationkey#X] +Arguments: [c_custkey#X ASC NULLS FIRST], false, 0 + +(175) SortMergeJoin Left keys [1]: [o_custkey#X] Right keys [1]: [c_custkey#X] Join type: Inner Join condition: None -(152) Project +(176) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X, c_custkey#X, c_nationkey#X] -(153) Exchange +(177) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(154) Scan parquet +(178) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, 
o_orderdate#X, c_nationkey#X] +Arguments: [c_nationkey#X ASC NULLS FIRST], false, 0 + +(179) Scan parquet Output [2]: [n_nationkey#X, n_regionkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey), IsNotNull(n_regionkey)] ReadSchema: struct -(155) Filter +(180) Filter Input [2]: [n_nationkey#X, n_regionkey#X] Condition : (isnotnull(n_nationkey#X) AND isnotnull(n_regionkey#X)) -(156) Exchange +(181) Exchange Input [2]: [n_nationkey#X, n_regionkey#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(157) ShuffledHashJoin +(182) Sort +Input [2]: [n_nationkey#X, n_regionkey#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(183) SortMergeJoin Left keys [1]: [c_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(158) Project +(184) Project Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X, n_nationkey#X, n_regionkey#X] -(159) Exchange +(185) Exchange Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(160) Scan parquet +(186) Sort +Input [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(187) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(161) Filter +(188) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(162) Exchange +(189) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(163) ShuffledHashJoin +(190) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(191) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(164) Project +(192) Project Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Input [7]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X, n_nationkey#X, n_name#X] -(165) Exchange +(193) Exchange Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(166) Scan parquet +(194) Sort +Input [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] +Arguments: [n_regionkey#X ASC NULLS FIRST], false, 0 + +(195) Scan parquet Output [2]: [r_regionkey#X, r_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(r_name), EqualTo(r_name,AMERICA), IsNotNull(r_regionkey)] ReadSchema: struct -(167) Filter +(196) Filter Input [2]: [r_regionkey#X, r_name#X] Condition : ((isnotnull(r_name#X) AND (r_name#X = AMERICA)) AND isnotnull(r_regionkey#X)) -(168) Project +(197) Project Output [1]: [r_regionkey#X] Input [2]: [r_regionkey#X, r_name#X] -(169) Exchange +(198) Exchange Input [1]: [r_regionkey#X] Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(170) ShuffledHashJoin +(199) Sort +Input [1]: [r_regionkey#X] +Arguments: [r_regionkey#X ASC NULLS FIRST], false, 0 + +(200) SortMergeJoin Left keys [1]: [n_regionkey#X] Right keys [1]: [r_regionkey#X] Join type: Inner Join condition: None 
-(171) Project +(201) Project Output [3]: [year(o_orderdate#X) AS o_year#X, (l_extendedprice#X * (1 - l_discount#X)) AS volume#X, n_name#X AS nation#X] Input [6]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X, r_regionkey#X] -(172) HashAggregate +(202) HashAggregate Input [3]: [o_year#X, volume#X, nation#X] Keys [1]: [o_year#X] Functions [2]: [partial_sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), partial_sum(volume#X)] Aggregate Attributes [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] Results [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -(173) Exchange +(203) Exchange Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(174) HashAggregate +(204) HashAggregate Input [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] Keys [1]: [o_year#X] Functions [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END), sum(volume#X)] Aggregate Attributes [2]: [sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X, sum(volume#X)#X] Results [2]: [o_year#X, (sum(CASE WHEN (nation#X = BRAZIL) THEN volume#X ELSE 0.0000 END)#X / sum(volume#X)#X) AS mkt_share#X] -(175) Exchange +(205) Exchange Input [2]: [o_year#X, mkt_share#X] Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(176) Sort +(206) Sort Input [2]: [o_year#X, mkt_share#X] Arguments: [o_year#X ASC NULLS FIRST], true, 0 -(177) AdaptiveSparkPlan +(207) AdaptiveSparkPlan Output [2]: [o_year#X, mkt_share#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/9.txt index aef854c3549ef..3000cbae7a6d4 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/9.txt @@ -1,126 +1,136 @@ == Physical Plan == -AdaptiveSparkPlan (133) +AdaptiveSparkPlan (155) +- == Final Plan == - VeloxColumnarToRowExec (94) - +- ^ SortExecTransformer (92) - +- ^ InputIteratorTransformer (91) - +- ^ InputAdapter (90) - +- ^ ShuffleQueryStage (89), Statistics(X) - +- ColumnarExchange (88) - +- ^ RegularHashAggregateExecTransformer (86) - +- ^ InputIteratorTransformer (85) - +- ^ InputAdapter (84) - +- ^ ShuffleQueryStage (83), Statistics(X) - +- ColumnarExchange (82) - +- ^ ProjectExecTransformer (80) - +- ^ FlushableHashAggregateExecTransformer (79) - +- ^ ProjectExecTransformer (78) - +- ^ ShuffledHashJoinExecTransformer Inner (77) - :- ^ InputIteratorTransformer (68) - : +- ^ InputAdapter (67) - : +- ^ ShuffleQueryStage (66), Statistics(X) - : +- ColumnarExchange (65) - : +- ^ ProjectExecTransformer (63) - : +- ^ ShuffledHashJoinExecTransformer Inner (62) - : :- ^ InputIteratorTransformer (53) - : : +- ^ InputAdapter (52) - : : +- ^ ShuffleQueryStage (51), Statistics(X) - : : +- ColumnarExchange (50) - : : +- ^ ProjectExecTransformer (48) - : : +- ^ ShuffledHashJoinExecTransformer Inner (47) - : : :- ^ InputIteratorTransformer (38) - : : : +- ^ InputAdapter (37) - : : : +- ^ ShuffleQueryStage (36), Statistics(X) - : : : +- ColumnarExchange (35) - : : : +- ^ ProjectExecTransformer (33) - : : : +- ^ ShuffledHashJoinExecTransformer Inner (32) - : : : :- ^ InputIteratorTransformer (23) - : : : : +- ^ InputAdapter (22) - : : : : +- ^ ShuffleQueryStage (21), Statistics(X) - : : : : +- ColumnarExchange (20) - : : : : +- ^ ProjectExecTransformer 
(18) - : : : : +- ^ ShuffledHashJoinExecTransformer Inner (17) - : : : : :- ^ InputIteratorTransformer (8) - : : : : : +- ^ InputAdapter (7) - : : : : : +- ^ ShuffleQueryStage (6), Statistics(X) - : : : : : +- ColumnarExchange (5) + VeloxColumnarToRowExec (106) + +- ^ SortExecTransformer (104) + +- ^ InputIteratorTransformer (103) + +- ShuffleQueryStage (101), Statistics(X) + +- ColumnarExchange (100) + +- VeloxAppendBatches (99) + +- ^ RegularHashAggregateExecTransformer (97) + +- ^ InputIteratorTransformer (96) + +- ShuffleQueryStage (94), Statistics(X) + +- ColumnarExchange (93) + +- VeloxAppendBatches (92) + +- ^ ProjectExecTransformer (90) + +- ^ FlushableHashAggregateExecTransformer (89) + +- ^ ProjectExecTransformer (88) + +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (87) + :- ^ InputIteratorTransformer (77) + : +- ShuffleQueryStage (75), Statistics(X) + : +- ColumnarExchange (74) + : +- VeloxAppendBatches (73) + : +- ^ ProjectExecTransformer (71) + : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (70) + : :- ^ InputIteratorTransformer (60) + : : +- ShuffleQueryStage (58), Statistics(X) + : : +- ColumnarExchange (57) + : : +- VeloxAppendBatches (56) + : : +- ^ ProjectExecTransformer (54) + : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (53) + : : :- ^ InputIteratorTransformer (43) + : : : +- ShuffleQueryStage (41), Statistics(X) + : : : +- ColumnarExchange (40) + : : : +- VeloxAppendBatches (39) + : : : +- ^ ProjectExecTransformer (37) + : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildRight (36) + : : : :- ^ InputIteratorTransformer (26) + : : : : +- ShuffleQueryStage (24), Statistics(X) + : : : : +- ColumnarExchange (23) + : : : : +- VeloxAppendBatches (22) + : : : : +- ^ ProjectExecTransformer (20) + : : : : +- ^ ShuffledHashJoinExecTransformer Inner BuildLeft (19) + : : : : :- ^ InputIteratorTransformer (9) + : : : : : +- ShuffleQueryStage (7), Statistics(X) + : : : : : +- ColumnarExchange (6) + : : : : : +- VeloxAppendBatches (5) : : : : : +- ^ ProjectExecTransformer (3) : : : : : +- ^ FilterExecTransformer (2) : : : : : +- ^ Scan parquet (1) - : : : : +- ^ InputIteratorTransformer (16) - : : : : +- ^ InputAdapter (15) - : : : : +- ^ ShuffleQueryStage (14), Statistics(X) - : : : : +- ColumnarExchange (13) - : : : : +- ^ ProjectExecTransformer (11) - : : : : +- ^ FilterExecTransformer (10) - : : : : +- ^ Scan parquet (9) - : : : +- ^ InputIteratorTransformer (31) - : : : +- ^ InputAdapter (30) - : : : +- ^ ShuffleQueryStage (29), Statistics(X) - : : : +- ColumnarExchange (28) - : : : +- ^ ProjectExecTransformer (26) - : : : +- ^ FilterExecTransformer (25) - : : : +- ^ Scan parquet (24) - : : +- ^ InputIteratorTransformer (46) - : : +- ^ InputAdapter (45) - : : +- ^ ShuffleQueryStage (44), Statistics(X) - : : +- ColumnarExchange (43) - : : +- ^ ProjectExecTransformer (41) - : : +- ^ FilterExecTransformer (40) - : : +- ^ Scan parquet (39) - : +- ^ InputIteratorTransformer (61) - : +- ^ InputAdapter (60) - : +- ^ ShuffleQueryStage (59), Statistics(X) - : +- ColumnarExchange (58) - : +- ^ ProjectExecTransformer (56) - : +- ^ FilterExecTransformer (55) - : +- ^ Scan parquet (54) - +- ^ InputIteratorTransformer (76) - +- ^ InputAdapter (75) - +- ^ ShuffleQueryStage (74), Statistics(X) - +- ColumnarExchange (73) - +- ^ ProjectExecTransformer (71) - +- ^ FilterExecTransformer (70) - +- ^ Scan parquet (69) + : : : : +- ^ InputIteratorTransformer (18) + : : : : +- ShuffleQueryStage (16), Statistics(X) + : : : : +- ColumnarExchange (15) + : : : : +- 
VeloxAppendBatches (14) + : : : : +- ^ ProjectExecTransformer (12) + : : : : +- ^ FilterExecTransformer (11) + : : : : +- ^ Scan parquet (10) + : : : +- ^ InputIteratorTransformer (35) + : : : +- ShuffleQueryStage (33), Statistics(X) + : : : +- ColumnarExchange (32) + : : : +- VeloxAppendBatches (31) + : : : +- ^ ProjectExecTransformer (29) + : : : +- ^ FilterExecTransformer (28) + : : : +- ^ Scan parquet (27) + : : +- ^ InputIteratorTransformer (52) + : : +- ShuffleQueryStage (50), Statistics(X) + : : +- ColumnarExchange (49) + : : +- VeloxAppendBatches (48) + : : +- ^ ProjectExecTransformer (46) + : : +- ^ FilterExecTransformer (45) + : : +- ^ Scan parquet (44) + : +- ^ InputIteratorTransformer (69) + : +- ShuffleQueryStage (67), Statistics(X) + : +- ColumnarExchange (66) + : +- VeloxAppendBatches (65) + : +- ^ ProjectExecTransformer (63) + : +- ^ FilterExecTransformer (62) + : +- ^ Scan parquet (61) + +- ^ InputIteratorTransformer (86) + +- ShuffleQueryStage (84), Statistics(X) + +- ColumnarExchange (83) + +- VeloxAppendBatches (82) + +- ^ ProjectExecTransformer (80) + +- ^ FilterExecTransformer (79) + +- ^ Scan parquet (78) +- == Initial Plan == - Sort (132) - +- Exchange (131) - +- HashAggregate (130) - +- Exchange (129) - +- HashAggregate (128) - +- Project (127) - +- ShuffledHashJoin Inner BuildRight (126) - :- Exchange (122) - : +- Project (121) - : +- ShuffledHashJoin Inner BuildRight (120) - : :- Exchange (116) - : : +- Project (115) - : : +- ShuffledHashJoin Inner BuildRight (114) - : : :- Exchange (110) - : : : +- Project (109) - : : : +- ShuffledHashJoin Inner BuildRight (108) - : : : :- Exchange (104) - : : : : +- Project (103) - : : : : +- ShuffledHashJoin Inner BuildLeft (102) - : : : : :- Exchange (98) - : : : : : +- Project (97) - : : : : : +- Filter (96) - : : : : : +- Scan parquet (95) - : : : : +- Exchange (101) - : : : : +- Filter (100) - : : : : +- Scan parquet (99) - : : : +- Exchange (107) - : : : +- Filter (106) - : : : +- Scan parquet (105) - : : +- Exchange (113) - : : +- Filter (112) - : : +- Scan parquet (111) - : +- Exchange (119) - : +- Filter (118) - : +- Scan parquet (117) - +- Exchange (125) - +- Filter (124) - +- Scan parquet (123) + Sort (154) + +- Exchange (153) + +- HashAggregate (152) + +- Exchange (151) + +- HashAggregate (150) + +- Project (149) + +- SortMergeJoin Inner (148) + :- Sort (143) + : +- Exchange (142) + : +- Project (141) + : +- SortMergeJoin Inner (140) + : :- Sort (135) + : : +- Exchange (134) + : : +- Project (133) + : : +- SortMergeJoin Inner (132) + : : :- Sort (127) + : : : +- Exchange (126) + : : : +- Project (125) + : : : +- SortMergeJoin Inner (124) + : : : :- Sort (119) + : : : : +- Exchange (118) + : : : : +- Project (117) + : : : : +- SortMergeJoin Inner (116) + : : : : :- Sort (111) + : : : : : +- Exchange (110) + : : : : : +- Project (109) + : : : : : +- Filter (108) + : : : : : +- Scan parquet (107) + : : : : +- Sort (115) + : : : : +- Exchange (114) + : : : : +- Filter (113) + : : : : +- Scan parquet (112) + : : : +- Sort (123) + : : : +- Exchange (122) + : : : +- Filter (121) + : : : +- Scan parquet (120) + : : +- Sort (131) + : : +- Exchange (130) + : : +- Filter (129) + : : +- Scan parquet (128) + : +- Sort (139) + : +- Exchange (138) + : +- Filter (137) + : +- Scan parquet (136) + +- Sort (147) + +- Exchange (146) + +- Filter (145) + +- Scan parquet (144) (1) Scan parquet @@ -142,558 +152,646 @@ Input [2]: [p_partkey#X, p_name#X] Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: false -(5) ColumnarExchange 
+(5) VeloxAppendBatches +Input [2]: [hash_partition_key#X, p_partkey#X] +Arguments: X + +(6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] -(6) ShuffleQueryStage +(7) ShuffleQueryStage Output [1]: [p_partkey#X] Arguments: X -(7) InputAdapter +(8) InputAdapter Input [1]: [p_partkey#X] -(8) InputIteratorTransformer +(9) InputIteratorTransformer Input [1]: [p_partkey#X] -(9) Scan parquet +(10) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(10) FilterExecTransformer +(11) FilterExecTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(11) ProjectExecTransformer +(12) ProjectExecTransformer Output [7]: [hash(l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(12) WholeStageCodegenTransformer (X) +(13) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(13) ColumnarExchange +(14) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(15) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(14) ShuffleQueryStage +(16) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(15) InputAdapter +(17) InputAdapter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(16) InputIteratorTransformer +(18) InputIteratorTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(17) ShuffledHashJoinExecTransformer +(19) ShuffledHashJoinExecTransformer Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: None -(18) ProjectExecTransformer +(20) ProjectExecTransformer Output [7]: [hash(l_suppkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(19) WholeStageCodegenTransformer (X) +(21) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: false -(20) ColumnarExchange +(22) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: X + +(23) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: 
hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] -(21) ShuffleQueryStage +(24) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: X -(22) InputAdapter +(25) InputAdapter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(23) InputIteratorTransformer +(26) InputIteratorTransformer Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(24) Scan parquet +(27) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(25) FilterExecTransformer +(28) FilterExecTransformer Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(26) ProjectExecTransformer +(29) ProjectExecTransformer Output [3]: [hash(s_suppkey#X, 42) AS hash_partition_key#X, s_suppkey#X, s_nationkey#X] Input [2]: [s_suppkey#X, s_nationkey#X] -(27) WholeStageCodegenTransformer (X) +(30) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: false -(28) ColumnarExchange +(31) VeloxAppendBatches +Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] +Arguments: X + +(32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] -(29) ShuffleQueryStage +(33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] Arguments: X -(30) InputAdapter +(34) InputAdapter Input [2]: [s_suppkey#X, s_nationkey#X] -(31) InputIteratorTransformer +(35) InputIteratorTransformer Input [2]: [s_suppkey#X, s_nationkey#X] -(32) ShuffledHashJoinExecTransformer +(36) ShuffledHashJoinExecTransformer Left keys [1]: [l_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(33) ProjectExecTransformer +(37) ProjectExecTransformer Output [8]: [hash(l_suppkey#X, l_partkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(34) WholeStageCodegenTransformer (X) +(38) WholeStageCodegenTransformer (X) Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: false -(35) ColumnarExchange +(39) VeloxAppendBatches +Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: X + +(40) ColumnarExchange Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] -(36) ShuffleQueryStage +(41) ShuffleQueryStage Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: X -(37) InputAdapter +(42) InputAdapter Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, 
l_extendedprice#X, l_discount#X, s_nationkey#X] -(38) InputIteratorTransformer +(43) InputIteratorTransformer Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -(39) Scan parquet +(44) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(40) FilterExecTransformer +(45) FilterExecTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(41) ProjectExecTransformer +(46) ProjectExecTransformer Output [4]: [hash(ps_suppkey#X, ps_partkey#X, 42) AS hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(42) WholeStageCodegenTransformer (X) +(47) WholeStageCodegenTransformer (X) Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: false -(43) ColumnarExchange +(48) VeloxAppendBatches +Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] +Arguments: X + +(49) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [id=#X] -(44) ShuffleQueryStage +(50) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: X -(45) InputAdapter +(51) InputAdapter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(46) InputIteratorTransformer +(52) InputIteratorTransformer Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(47) ShuffledHashJoinExecTransformer +(53) ShuffledHashJoinExecTransformer Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join type: Inner Join condition: None -(48) ProjectExecTransformer +(54) ProjectExecTransformer Output [7]: [hash(l_orderkey#X, 42) AS hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(49) WholeStageCodegenTransformer (X) +(55) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: false -(50) ColumnarExchange +(56) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] +Arguments: X + +(57) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [id=#X] -(51) ShuffleQueryStage +(58) ShuffleQueryStage Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: X -(52) InputAdapter +(59) InputAdapter Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -(53) InputIteratorTransformer +(60) InputIteratorTransformer Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, 
ps_supplycost#X] -(54) Scan parquet +(61) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(55) FilterExecTransformer +(62) FilterExecTransformer Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: isnotnull(o_orderkey#X) -(56) ProjectExecTransformer +(63) ProjectExecTransformer Output [3]: [hash(o_orderkey#X, 42) AS hash_partition_key#X, o_orderkey#X, o_orderdate#X] Input [2]: [o_orderkey#X, o_orderdate#X] -(57) WholeStageCodegenTransformer (X) +(64) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] Arguments: false -(58) ColumnarExchange +(65) VeloxAppendBatches +Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] +Arguments: X + +(66) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [id=#X] -(59) ShuffleQueryStage +(67) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderdate#X] Arguments: X -(60) InputAdapter +(68) InputAdapter Input [2]: [o_orderkey#X, o_orderdate#X] -(61) InputIteratorTransformer +(69) InputIteratorTransformer Input [2]: [o_orderkey#X, o_orderdate#X] -(62) ShuffledHashJoinExecTransformer +(70) ShuffledHashJoinExecTransformer Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(63) ProjectExecTransformer +(71) ProjectExecTransformer Output [7]: [hash(s_nationkey#X, 42) AS hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(64) WholeStageCodegenTransformer (X) +(72) WholeStageCodegenTransformer (X) Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: false -(65) ColumnarExchange +(73) VeloxAppendBatches +Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] +Arguments: X + +(74) ColumnarExchange Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [id=#X] -(66) ShuffleQueryStage +(75) ShuffleQueryStage Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: X -(67) InputAdapter +(76) InputAdapter Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -(68) InputIteratorTransformer +(77) InputIteratorTransformer Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -(69) Scan parquet +(78) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(70) FilterExecTransformer +(79) FilterExecTransformer Input [2]: [n_nationkey#X, n_name#X] Arguments: isnotnull(n_nationkey#X) -(71) ProjectExecTransformer +(80) ProjectExecTransformer Output [3]: [hash(n_nationkey#X, 42) AS hash_partition_key#X, n_nationkey#X, n_name#X] Input [2]: [n_nationkey#X, n_name#X] -(72) 
WholeStageCodegenTransformer (X) +(81) WholeStageCodegenTransformer (X) Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: false -(73) ColumnarExchange +(82) VeloxAppendBatches +Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] +Arguments: X + +(83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] -(74) ShuffleQueryStage +(84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] Arguments: X -(75) InputAdapter +(85) InputAdapter Input [2]: [n_nationkey#X, n_name#X] -(76) InputIteratorTransformer +(86) InputIteratorTransformer Input [2]: [n_nationkey#X, n_name#X] -(77) ShuffledHashJoinExecTransformer +(87) ShuffledHashJoinExecTransformer Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(78) ProjectExecTransformer +(88) ProjectExecTransformer Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, ((l_extendedprice#X * (1 - l_discount#X)) - (ps_supplycost#X * l_quantity#X)) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(79) FlushableHashAggregateExecTransformer +(89) FlushableHashAggregateExecTransformer Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(80) ProjectExecTransformer +(90) ProjectExecTransformer Output [5]: [hash(nation#X, o_year#X, 42) AS hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(81) WholeStageCodegenTransformer (X) +(91) WholeStageCodegenTransformer (X) Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: false -(82) ColumnarExchange +(92) VeloxAppendBatches +Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] +Arguments: X + +(93) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] -(83) ShuffleQueryStage +(94) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: X -(84) InputAdapter +(95) InputAdapter Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(85) InputIteratorTransformer +(96) InputIteratorTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(86) RegularHashAggregateExecTransformer +(97) RegularHashAggregateExecTransformer Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(87) WholeStageCodegenTransformer (X) +(98) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(88) ColumnarExchange +(99) VeloxAppendBatches +Input [3]: [nation#X, o_year#X, sum_profit#X] +Arguments: X + +(100) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] -(89) ShuffleQueryStage +(101) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: X -(90) InputAdapter +(102) InputAdapter Input [3]: [nation#X, 
o_year#X, sum_profit#X] -(91) InputIteratorTransformer +(103) InputIteratorTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] -(92) SortExecTransformer +(104) SortExecTransformer Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(93) WholeStageCodegenTransformer (X) +(105) WholeStageCodegenTransformer (X) Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: false -(94) VeloxColumnarToRowExec +(106) VeloxColumnarToRowExec Input [3]: [nation#X, o_year#X, sum_profit#X] -(95) Scan parquet +(107) Scan parquet Output [2]: [p_partkey#X, p_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(p_name), StringContains(p_name,green), IsNotNull(p_partkey)] ReadSchema: struct -(96) Filter +(108) Filter Input [2]: [p_partkey#X, p_name#X] Condition : ((isnotnull(p_name#X) AND Contains(p_name#X, green)) AND isnotnull(p_partkey#X)) -(97) Project +(109) Project Output [1]: [p_partkey#X] Input [2]: [p_partkey#X, p_name#X] -(98) Exchange +(110) Exchange Input [1]: [p_partkey#X] Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(99) Scan parquet +(111) Sort +Input [1]: [p_partkey#X] +Arguments: [p_partkey#X ASC NULLS FIRST], false, 0 + +(112) Scan parquet Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(l_partkey), IsNotNull(l_suppkey), IsNotNull(l_orderkey)] ReadSchema: struct -(100) Filter +(113) Filter Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Condition : ((isnotnull(l_partkey#X) AND isnotnull(l_suppkey#X)) AND isnotnull(l_orderkey#X)) -(101) Exchange +(114) Exchange Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(102) ShuffledHashJoin +(115) Sort +Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_partkey#X ASC NULLS FIRST], false, 0 + +(116) SortMergeJoin Left keys [1]: [p_partkey#X] Right keys [1]: [l_partkey#X] Join type: Inner Join condition: None -(103) Project +(117) Project Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Input [7]: [p_partkey#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -(104) Exchange +(118) Exchange Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(105) Scan parquet +(119) Sort +Input [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] +Arguments: [l_suppkey#X ASC NULLS FIRST], false, 0 + +(120) Scan parquet Output [2]: [s_suppkey#X, s_nationkey#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(s_suppkey), IsNotNull(s_nationkey)] ReadSchema: struct -(106) Filter +(121) Filter Input [2]: [s_suppkey#X, s_nationkey#X] Condition : (isnotnull(s_suppkey#X) AND isnotnull(s_nationkey#X)) -(107) Exchange +(122) Exchange Input [2]: [s_suppkey#X, s_nationkey#X] Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(108) ShuffledHashJoin +(123) Sort +Input [2]: [s_suppkey#X, s_nationkey#X] +Arguments: [s_suppkey#X ASC NULLS FIRST], false, 0 + +(124) SortMergeJoin Left keys [1]: 
[l_suppkey#X] Right keys [1]: [s_suppkey#X] Join type: Inner Join condition: None -(109) Project +(125) Project Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Input [8]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_suppkey#X, s_nationkey#X] -(110) Exchange +(126) Exchange Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(111) Scan parquet +(127) Sort +Input [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] +Arguments: [l_suppkey#X ASC NULLS FIRST, l_partkey#X ASC NULLS FIRST], false, 0 + +(128) Scan parquet Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(ps_suppkey), IsNotNull(ps_partkey)] ReadSchema: struct -(112) Filter +(129) Filter Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Condition : (isnotnull(ps_suppkey#X) AND isnotnull(ps_partkey#X)) -(113) Exchange +(130) Exchange Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(114) ShuffledHashJoin +(131) Sort +Input [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] +Arguments: [ps_suppkey#X ASC NULLS FIRST, ps_partkey#X ASC NULLS FIRST], false, 0 + +(132) SortMergeJoin Left keys [2]: [l_suppkey#X, l_partkey#X] Right keys [2]: [ps_suppkey#X, ps_partkey#X] Join type: Inner Join condition: None -(115) Project +(133) Project Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Input [10]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -(116) Exchange +(134) Exchange Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(117) Scan parquet +(135) Sort +Input [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] +Arguments: [l_orderkey#X ASC NULLS FIRST], false, 0 + +(136) Scan parquet Output [2]: [o_orderkey#X, o_orderdate#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(o_orderkey)] ReadSchema: struct -(118) Filter +(137) Filter Input [2]: [o_orderkey#X, o_orderdate#X] Condition : isnotnull(o_orderkey#X) -(119) Exchange +(138) Exchange Input [2]: [o_orderkey#X, o_orderdate#X] Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(120) ShuffledHashJoin +(139) Sort +Input [2]: [o_orderkey#X, o_orderdate#X] +Arguments: [o_orderkey#X ASC NULLS FIRST], false, 0 + +(140) SortMergeJoin Left keys [1]: [l_orderkey#X] Right keys [1]: [o_orderkey#X] Join type: Inner Join condition: None -(121) Project +(141) Project Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Input [8]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderkey#X, o_orderdate#X] -(122) Exchange +(142) Exchange Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] 
-(123) Scan parquet +(143) Sort +Input [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] +Arguments: [s_nationkey#X ASC NULLS FIRST], false, 0 + +(144) Scan parquet Output [2]: [n_nationkey#X, n_name#X] Batched: true Location: InMemoryFileIndex [*] PushedFilters: [IsNotNull(n_nationkey)] ReadSchema: struct -(124) Filter +(145) Filter Input [2]: [n_nationkey#X, n_name#X] Condition : isnotnull(n_nationkey#X) -(125) Exchange +(146) Exchange Input [2]: [n_nationkey#X, n_name#X] Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(126) ShuffledHashJoin +(147) Sort +Input [2]: [n_nationkey#X, n_name#X] +Arguments: [n_nationkey#X ASC NULLS FIRST], false, 0 + +(148) SortMergeJoin Left keys [1]: [s_nationkey#X] Right keys [1]: [n_nationkey#X] Join type: Inner Join condition: None -(127) Project +(149) Project Output [3]: [n_name#X AS nation#X, year(o_orderdate#X) AS o_year#X, ((l_extendedprice#X * (1 - l_discount#X)) - (ps_supplycost#X * l_quantity#X)) AS amount#X] Input [8]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X, n_nationkey#X, n_name#X] -(128) HashAggregate +(150) HashAggregate Input [3]: [nation#X, o_year#X, amount#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [partial_sum(amount#X)] Aggregate Attributes [2]: [sum#X, isEmpty#X] Results [4]: [nation#X, o_year#X, sum#X, isEmpty#X] -(129) Exchange +(151) Exchange Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(130) HashAggregate +(152) HashAggregate Input [4]: [nation#X, o_year#X, sum#X, isEmpty#X] Keys [2]: [nation#X, o_year#X] Functions [1]: [sum(amount#X)] Aggregate Attributes [1]: [sum(amount#X)#X] Results [3]: [nation#X, o_year#X, sum(amount#X)#X AS sum_profit#X] -(131) Exchange +(153) Exchange Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X] -(132) Sort +(154) Sort Input [3]: [nation#X, o_year#X, sum_profit#X] Arguments: [nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST], true, 0 -(133) AdaptiveSparkPlan +(155) AdaptiveSparkPlan Output [3]: [nation#X, o_year#X, sum_profit#X] Arguments: isFinalPlan=true \ No newline at end of file diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/FallbackSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/FallbackSuite.scala index 15a71ceb587ba..2b40ac54b2c67 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/FallbackSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/FallbackSuite.scala @@ -20,8 +20,10 @@ import org.apache.gluten.GlutenConfig import org.apache.gluten.extension.GlutenPlan import org.apache.spark.SparkConf -import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.{ColumnarShuffleExchangeExec, SparkPlan} import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, AQEShuffleReadExec} +import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec +import org.apache.spark.sql.execution.joins.SortMergeJoinExec class FallbackSuite extends VeloxWholeStageTransformerSuite with AdaptiveSparkPlanHelper { protected val rootPath: String = getClass.getResource("/").getPath @@ -71,6 +73,29 @@ class FallbackSuite extends VeloxWholeStageTransformerSuite with AdaptiveSparkPl collect(plan) { case v: VeloxColumnarToRowExec => v }.size } + 
private def collectColumnarShuffleExchange(plan: SparkPlan): Int = { + collect(plan) { case c: ColumnarShuffleExchangeExec => c }.size + } + + private def collectShuffleExchange(plan: SparkPlan): Int = { + collect(plan) { case c: ShuffleExchangeExec => c }.size + } + + test("fallback with shuffle manager") { + withSQLConf(GlutenConfig.COLUMNAR_SHUFFLE_ENABLED.key -> "false") { + runQueryAndCompare("select c1, count(*) from tmp1 group by c1") { + df => + val plan = df.queryExecution.executedPlan + + assert(collectColumnarShuffleExchange(plan) == 0) + assert(collectShuffleExchange(plan) == 1) + + val wholeQueryColumnarToRow = collectColumnarToRow(plan) + assert(wholeQueryColumnarToRow == 2) + } + } + } + test("fallback with collect") { withSQLConf(GlutenConfig.COLUMNAR_WHOLESTAGE_FALLBACK_THRESHOLD.key -> "1") { runQueryAndCompare("SELECT count(*) FROM tmp1") { @@ -216,4 +241,26 @@ class FallbackSuite extends VeloxWholeStageTransformerSuite with AdaptiveSparkPl } } } + + test("fallback with smj") { + val sql = "SELECT /*+ SHUFFLE_MERGE(tmp1) */ * FROM tmp1 join tmp2 on tmp1.c1 = tmp2.c1" + withSQLConf( + GlutenConfig.COLUMNAR_FPRCE_SHUFFLED_HASH_JOIN_ENABLED.key -> "true", + GlutenConfig.COLUMNAR_SHUFFLED_HASH_JOIN_ENABLED.key -> "false") { + runQueryAndCompare(sql) { + df => + val plan = df.queryExecution.executedPlan + assert(collect(plan) { case smj: SortMergeJoinExec => smj }.size == 1) + } + } + withSQLConf( + GlutenConfig.COLUMNAR_FPRCE_SHUFFLED_HASH_JOIN_ENABLED.key -> "false", + GlutenConfig.COLUMNAR_SORTMERGEJOIN_ENABLED.key -> "false") { + runQueryAndCompare(sql) { + df => + val plan = df.queryExecution.executedPlan + assert(collect(plan) { case smj: SortMergeJoinExec => smj }.size == 1) + } + } + } } diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala index 16cc632dbdf29..622f300a0cc20 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala @@ -16,6 +16,7 @@ */ package org.apache.gluten.execution +import org.apache.spark.sql.execution.ProjectExec import org.apache.spark.sql.types._ import java.sql.Timestamp @@ -100,6 +101,12 @@ class ScalarFunctionsValidateSuite extends FunctionsValidateTest { } } + testWithSpecifiedSparkVersion("null input for array_size", Some("3.3")) { + runQueryAndCompare("SELECT array_size(null)") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + test("chr function") { val df = runQueryAndCompare( "SELECT chr(l_orderkey + 64) " + @@ -156,24 +163,28 @@ class ScalarFunctionsValidateSuite extends FunctionsValidateTest { checkLengthAndPlan(df, 1) } - test("greatest function") { - val df = runQueryAndCompare( - "SELECT greatest(l_orderkey, l_orderkey)" + - "from lineitem limit 1")(checkGlutenOperatorMatch[ProjectExecTransformer]) - } - - test("least function") { - val df = runQueryAndCompare( - "SELECT least(l_orderkey, l_orderkey)" + - "from lineitem limit 1")(checkGlutenOperatorMatch[ProjectExecTransformer]) - } - test("Test greatest function") { runQueryAndCompare( "SELECT greatest(l_orderkey, l_orderkey)" + "from lineitem limit 1") { checkGlutenOperatorMatch[ProjectExecTransformer] } + withTempPath { + path => + spark + .sql("""SELECT * + FROM VALUES (CAST(5.345 AS DECIMAL(6, 2)), CAST(5.35 AS DECIMAL(5, 4))), + (CAST(5.315 AS DECIMAL(6, 2)), CAST(5.355 
AS DECIMAL(5, 4))), + (CAST(3.345 AS DECIMAL(6, 2)), CAST(4.35 AS DECIMAL(5, 4))) AS data(a, b);""") + .write + .parquet(path.getCanonicalPath) + + spark.read.parquet(path.getCanonicalPath).createOrReplaceTempView("view") + + runQueryAndCompare("SELECT greatest(a, b) from view") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } } test("Test least function") { @@ -182,6 +193,22 @@ class ScalarFunctionsValidateSuite extends FunctionsValidateTest { "from lineitem limit 1") { checkGlutenOperatorMatch[ProjectExecTransformer] } + withTempPath { + path => + spark + .sql("""SELECT * + FROM VALUES (CAST(5.345 AS DECIMAL(6, 2)), CAST(5.35 AS DECIMAL(5, 4))), + (CAST(5.315 AS DECIMAL(6, 2)), CAST(5.355 AS DECIMAL(5, 4))), + (CAST(3.345 AS DECIMAL(6, 2)), CAST(4.35 AS DECIMAL(5, 4))) AS data(a, b);""") + .write + .parquet(path.getCanonicalPath) + + spark.read.parquet(path.getCanonicalPath).createOrReplaceTempView("view") + + runQueryAndCompare("SELECT least(a, b) from view") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } } test("Test hash function") { @@ -272,6 +299,12 @@ class ScalarFunctionsValidateSuite extends FunctionsValidateTest { } } + test("Test log function") { + runQueryAndCompare("SELECT log(10, l_orderkey) from lineitem limit 1") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + test("Test shiftleft function") { val df = runQueryAndCompare("SELECT shiftleft(int_field1, 1) from datatab limit 1") { checkGlutenOperatorMatch[ProjectExecTransformer] @@ -537,6 +570,46 @@ class ScalarFunctionsValidateSuite extends FunctionsValidateTest { } } + test("test transform_keys function") { + withTempPath { + path => + Seq( + Map[String, Int]("a" -> 1, "b" -> 2), + Map[String, Int]("a" -> 2, "b" -> 3), + null + ) + .toDF("m") + .write + .parquet(path.getCanonicalPath) + + spark.read.parquet(path.getCanonicalPath).createOrReplaceTempView("map_tbl") + + runQueryAndCompare("select transform_keys(m, (k, v) -> upper(k)) from map_tbl") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + } + + test("test transform_values function") { + withTempPath { + path => + Seq( + Map[String, Int]("a" -> 1, "b" -> 2), + Map[String, Int]("a" -> 2, "b" -> 3), + null + ) + .toDF("m") + .write + .parquet(path.getCanonicalPath) + + spark.read.parquet(path.getCanonicalPath).createOrReplaceTempView("map_tbl") + + runQueryAndCompare("select transform_values(m, (k, v) -> v + 1) from map_tbl") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + } + test("zip_with") { withTempPath { path => @@ -583,6 +656,13 @@ class ScalarFunctionsValidateSuite extends FunctionsValidateTest { } } + test("Test input_file_name function") { + runQueryAndCompare("""SELECT input_file_name(), l_orderkey + | from lineitem limit 100""".stripMargin) { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + test("Test spark_partition_id function") { runQueryAndCompare("""SELECT spark_partition_id(), l_orderkey | from lineitem limit 100""".stripMargin) { @@ -650,6 +730,12 @@ class ScalarFunctionsValidateSuite extends FunctionsValidateTest { } } + test("soundex") { + runQueryAndCompare("select soundex(c_comment) from customer limit 50") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + test("Test make_timestamp function") { withTempPath { path => @@ -844,6 +930,30 @@ class ScalarFunctionsValidateSuite extends FunctionsValidateTest { } } + testWithSpecifiedSparkVersion("try_subtract", Some("3.3")) { + runQueryAndCompare( + "select try_subtract(2147483647, cast(l_orderkey as int)), " + 
+ "try_subtract(-2147483648, cast(l_orderkey as int)) from lineitem") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + + test("try_divide") { + runQueryAndCompare( + "select try_divide(cast(l_orderkey as int), 0) from lineitem", + noFallBack = false) { + _ => // Spark would always cast inputs to double for this function. + } + } + + testWithSpecifiedSparkVersion("try_multiply", Some("3.3")) { + runQueryAndCompare( + "select try_multiply(2147483647, cast(l_orderkey as int)), " + + "try_multiply(-2147483648, cast(l_orderkey as int)) from lineitem") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + test("test array forall") { withTempPath { path => @@ -1038,4 +1148,52 @@ class ScalarFunctionsValidateSuite extends FunctionsValidateTest { } } } + + test("arrays_overlap") { + withTempPath { + path => + Seq[(Seq[Integer], Seq[Integer])]((Seq(1, 2, 3), Seq(3, 4)), (Seq(5, null), Seq())) + .toDF("v1", "v2") + .write + .parquet(path.getCanonicalPath) + + spark.read.parquet(path.getCanonicalPath).createOrReplaceTempView("array_tbl") + + runQueryAndCompare("select arrays_overlap(v1, v2) from array_tbl;") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + } + + test("PreciseTimestampConversion") { + withTempPath { + path => + val df = spark + .sql( + "select * from VALUES ('A1', TIMESTAMP'2021-01-01 00:00:00'), " + + "('A1', TIMESTAMP'2021-01-01 00:04:30'), ('A1', TIMESTAMP'2021-01-01 00:06:00'), " + + "('A2', TIMESTAMP'2021-01-01 00:01:00') AS tab(a, b)") + .write + .parquet(path.getCanonicalPath) + + spark.read.parquet(path.getCanonicalPath).createOrReplaceTempView("string_timestamp") + + runQueryAndCompare( + "SELECT a, window.start, window.end, count(*) as cnt FROM" + + " string_timestamp GROUP by a, window(b, '5 minutes') ORDER BY a, start;") { + df => + val executedPlan = getExecutedPlan(df) + assert( + executedPlan.exists(plan => plan.isInstanceOf[ProjectExecTransformer]), + s"Expect ProjectExecTransformer exists " + + s"in executedPlan:\n ${executedPlan.last}" + ) + assert( + !executedPlan.exists(plan => plan.isInstanceOf[ProjectExec]), + s"Expect ProjectExec doesn't exist " + + s"in executedPlan:\n ${executedPlan.last}" + ) + } + } + } } diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/TestOperator.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/TestOperator.scala index 657039572d93a..c010b9128ce1c 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/TestOperator.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/TestOperator.scala @@ -19,21 +19,23 @@ package org.apache.gluten.execution import org.apache.gluten.GlutenConfig import org.apache.gluten.datasource.ArrowCSVFileFormat import org.apache.gluten.execution.datasource.v2.ArrowBatchScanExec +import org.apache.gluten.expression.VeloxDummyExpression import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.spark.SparkConf -import org.apache.spark.sql.{AnalysisException, Row} +import org.apache.spark.sql.{AnalysisException, DataFrame, Row} import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.window.WindowExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{DecimalType, StringType, StructField, StructType} +import org.apache.spark.sql.types.{DecimalType, IntegerType, StringType, StructField, StructType} import java.util.concurrent.TimeUnit import 
scala.collection.JavaConverters -class TestOperator extends VeloxWholeStageTransformerSuite { +class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPlanHelper { protected val rootPath: String = getClass.getResource("/").getPath override protected val resourcePath: String = "/tpch-data-parquet-velox" @@ -44,6 +46,12 @@ class TestOperator extends VeloxWholeStageTransformerSuite { override def beforeAll(): Unit = { super.beforeAll() createTPCHNotNullTables() + VeloxDummyExpression.registerFunctions(spark.sessionState.functionRegistry) + } + + override def afterAll(): Unit = { + VeloxDummyExpression.unregisterFunctions(spark.sessionState.functionRegistry) + super.afterAll() } override protected def sparkConf: SparkConf = { @@ -65,14 +73,20 @@ class TestOperator extends VeloxWholeStageTransformerSuite { test("select_part_column") { val df = runQueryAndCompare("select l_shipdate, l_orderkey from lineitem limit 1") { - df => { assert(df.schema.fields.length == 2) } + df => + { + assert(df.schema.fields.length == 2) + } } checkLengthAndPlan(df, 1) } test("select_as") { val df = runQueryAndCompare("select l_shipdate as my_col from lineitem limit 1") { - df => { assert(df.schema.fieldNames(0).equals("my_col")) } + df => + { + assert(df.schema.fieldNames(0).equals("my_col")) + } } checkLengthAndPlan(df, 1) } @@ -156,6 +170,71 @@ class TestOperator extends VeloxWholeStageTransformerSuite { checkLengthAndPlan(df, 60141) } + test("not in") { + // integral type + val df = runQueryAndCompare( + "select l_orderkey from lineitem " + + "where l_partkey not in (1552, 674, 1062)") { + checkGlutenOperatorMatch[FileSourceScanExecTransformer] + } + checkLengthAndPlan(df, 60053) + + val df2 = runQueryAndCompare( + "select l_orderkey from lineitem " + + "where l_partkey not in (1552, 674) and l_partkey not in (1062)") { + checkGlutenOperatorMatch[FileSourceScanExecTransformer] + } + checkLengthAndPlan(df2, 60053) + + val df3 = runQueryAndCompare( + "select l_orderkey from lineitem " + + "where l_partkey not in (1552, 674) and l_partkey != 1062") { + checkGlutenOperatorMatch[FileSourceScanExecTransformer] + } + checkLengthAndPlan(df3, 60053) + + // string type + val df4 = + runQueryAndCompare("select o_orderstatus from orders where o_orderstatus not in ('O', 'F')") { + checkGlutenOperatorMatch[FileSourceScanExecTransformer] + } + checkLengthAndPlan(df4, 363) + + // bool type + withTable("t") { + sql("create table t (id int, b boolean) using parquet") + sql("insert into t values (1, true), (2, false), (3, null)") + runQueryAndCompare("select * from t where b not in (true)") { + checkGlutenOperatorMatch[FileSourceScanExecTransformer] + } + + runQueryAndCompare("select * from t where b not in (true, false)") { + checkGlutenOperatorMatch[FileSourceScanExecTransformer] + } + } + + // mix not-in with range + runQueryAndCompare( + "select l_orderkey from lineitem " + + "where l_partkey not in (1552, 674) and l_partkey >= 1552") { + checkGlutenOperatorMatch[FileSourceScanExecTransformer] + } + + // mix not-in with in + runQueryAndCompare( + "select l_orderkey from lineitem " + + "where l_partkey not in (1552, 674) and l_partkey in (1552)") { + checkGlutenOperatorMatch[FileSourceScanExecTransformer] + } + + // not-in with or relation + runQueryAndCompare( + "select l_orderkey from lineitem " + + "where l_partkey not in (1552, 674) or l_partkey in (1552)") { + checkGlutenOperatorMatch[FileSourceScanExecTransformer] + } + } + test("coalesce") { var df = runQueryAndCompare( "select l_orderkey, 
coalesce(l_comment, 'default_val') " + @@ -211,17 +290,56 @@ class TestOperator extends VeloxWholeStageTransformerSuite { Seq("sort", "streaming").foreach { windowType => withSQLConf("spark.gluten.sql.columnar.backend.velox.window.type" -> windowType) { + runQueryAndCompare( + "select max(l_partkey) over" + + " (partition by l_suppkey order by l_orderkey" + + " RANGE BETWEEN 1 PRECEDING AND CURRENT ROW), " + + "min(l_comment) over" + + " (partition by l_suppkey order by l_linenumber" + + " RANGE BETWEEN 1 PRECEDING AND CURRENT ROW) from lineitem ") { + checkSparkOperatorMatch[WindowExecTransformer] + } + runQueryAndCompare( "select max(l_partkey) over" + " (partition by l_suppkey order by l_orderkey" + " RANGE BETWEEN CURRENT ROW AND 2 FOLLOWING) from lineitem ") { - checkSparkOperatorMatch[WindowExec] + checkSparkOperatorMatch[WindowExecTransformer] } runQueryAndCompare( "select max(l_partkey) over" + " (partition by l_suppkey order by l_orderkey" + " RANGE BETWEEN 6 PRECEDING AND CURRENT ROW) from lineitem ") { + checkSparkOperatorMatch[WindowExecTransformer] + } + + runQueryAndCompare( + "select max(l_partkey) over" + + " (partition by l_suppkey order by l_orderkey" + + " RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING) from lineitem ") { + checkSparkOperatorMatch[WindowExecTransformer] + } + + runQueryAndCompare( + "select max(l_partkey) over" + + " (partition by l_suppkey order by l_orderkey" + + " RANGE BETWEEN 6 PRECEDING AND 3 PRECEDING) from lineitem ") { + checkSparkOperatorMatch[WindowExecTransformer] + } + + runQueryAndCompare( + "select max(l_partkey) over" + + " (partition by l_suppkey order by l_orderkey" + + " RANGE BETWEEN 3 FOLLOWING AND 6 FOLLOWING) from lineitem ") { + checkSparkOperatorMatch[WindowExecTransformer] + } + + // DecimalType as order by column is not supported + runQueryAndCompare( + "select min(l_comment) over" + + " (partition by l_suppkey order by l_discount" + + " RANGE BETWEEN 1 PRECEDING AND CURRENT ROW) from lineitem ") { checkSparkOperatorMatch[WindowExec] } @@ -483,42 +601,153 @@ class TestOperator extends VeloxWholeStageTransformerSuite { } test("csv scan") { - val filePath = rootPath + "/datasource/csv/student.csv" - val df = spark.read - .format("csv") - .option("header", "true") - .load(filePath) - df.createOrReplaceTempView("student") - runQueryAndCompare("select * from student") { - df => - val plan = df.queryExecution.executedPlan - assert(plan.find(s => s.isInstanceOf[ColumnarToRowExec]).isDefined) - assert(plan.find(_.isInstanceOf[ArrowFileSourceScanExec]).isDefined) - val scan = plan.find(_.isInstanceOf[ArrowFileSourceScanExec]).toList.head - assert( - scan - .asInstanceOf[ArrowFileSourceScanExec] - .relation - .fileFormat - .isInstanceOf[ArrowCSVFileFormat]) + val df = runAndCompare("select * from student") { + val filePath = rootPath + "/datasource/csv/student.csv" + val df = spark.read + .format("csv") + .option("header", "true") + .load(filePath) + df.createOrReplaceTempView("student") + } + val plan = df.queryExecution.executedPlan + assert(plan.find(s => s.isInstanceOf[ColumnarToRowExec]).isDefined) + assert(plan.find(_.isInstanceOf[ArrowFileSourceScanExec]).isDefined) + val scan = plan.find(_.isInstanceOf[ArrowFileSourceScanExec]).toList.head + assert( + scan + .asInstanceOf[ArrowFileSourceScanExec] + .relation + .fileFormat + .isInstanceOf[ArrowCSVFileFormat]) + } + + test("csv scan with option string as null") { + val df = runAndCompare("select * from student") { + val filePath = rootPath + "/datasource/csv/student_option_str.csv" + // 
test strings as null + val df = spark.read + .format("csv") + .option("header", "true") + .load(filePath) + df.createOrReplaceTempView("student") } + val plan = df.queryExecution.executedPlan + assert(plan.find(_.isInstanceOf[ColumnarToRowExec]).isDefined) + assert(plan.find(_.isInstanceOf[ArrowFileSourceScanExec]).isDefined) + } + + test("csv scan with option delimiter") { + val df = runAndCompare("select * from student") { + val filePath = rootPath + "/datasource/csv/student_option.csv" + val df = spark.read + .format("csv") + .option("header", "true") + .option("delimiter", ";") + .load(filePath) + df.createOrReplaceTempView("student") + } + val plan = df.queryExecution.executedPlan + assert(plan.find(s => s.isInstanceOf[ColumnarToRowExec]).isDefined) + assert(plan.find(_.isInstanceOf[ArrowFileSourceScanExec]).isDefined) + } + + test("csv scan with schema") { + val df = runAndCompare("select * from student") { + val filePath = rootPath + "/datasource/csv/student_option_schema.csv" + val schema = new StructType() + .add("id", StringType) + .add("name", StringType) + .add("language", StringType) + val df = spark.read + .schema(schema) + .format("csv") + .option("header", "true") + .load(filePath) + df.createOrReplaceTempView("student") + } + val plan = df.queryExecution.executedPlan + assert(plan.find(s => s.isInstanceOf[ColumnarToRowExec]).isDefined) + val scan = plan.find(_.isInstanceOf[ArrowFileSourceScanExec]) + assert(scan.isDefined) + assert( + !scan.get + .asInstanceOf[ArrowFileSourceScanExec] + .original + .relation + .fileFormat + .asInstanceOf[ArrowCSVFileFormat] + .fallback) + } + + test("csv scan with missing columns") { + val df = runAndCompare("select languagemissing, language, id_new_col from student") { + val filePath = rootPath + "/datasource/csv/student_option_schema.csv" + val schema = new StructType() + .add("id_new_col", IntegerType) + .add("name", StringType) + .add("language", StringType) + .add("languagemissing", StringType) + val df = spark.read + .schema(schema) + .format("csv") + .option("header", "true") + .load(filePath) + df.createOrReplaceTempView("student") + } + val plan = df.queryExecution.executedPlan + assert(plan.find(s => s.isInstanceOf[VeloxColumnarToRowExec]).isDefined) + assert(plan.find(_.isInstanceOf[ArrowFileSourceScanExec]).isDefined) + } + + test("csv scan with different name") { + val df = runAndCompare("select * from student") { + val filePath = rootPath + "/datasource/csv/student_option_schema.csv" + val schema = new StructType() + .add("id_new_col", StringType) + .add("name", StringType) + .add("language", StringType) + val df = spark.read + .schema(schema) + .format("csv") + .option("header", "true") + .load(filePath) + df.createOrReplaceTempView("student") + } + val plan = df.queryExecution.executedPlan + assert(plan.find(s => s.isInstanceOf[ColumnarToRowExec]).isDefined) + assert(plan.find(_.isInstanceOf[ArrowFileSourceScanExec]).isDefined) + + val df2 = runAndCompare("select * from student_schema") { + val filePath = rootPath + "/datasource/csv/student_option_schema.csv" + val schema = new StructType() + .add("name", StringType) + .add("language", StringType) + val df = spark.read + .schema(schema) + .format("csv") + .option("header", "true") + .load(filePath) + df.createOrReplaceTempView("student_schema") + } + val plan2 = df2.queryExecution.executedPlan + assert(plan2.find(s => s.isInstanceOf[ColumnarToRowExec]).isDefined) + assert(plan2.find(_.isInstanceOf[ArrowFileSourceScanExec]).isDefined) } test("csv scan with filter") { - val 
filePath = rootPath + "/datasource/csv/student.csv" - val df = spark.read - .format("csv") - .option("header", "true") - .load(filePath) - df.createOrReplaceTempView("student") - runQueryAndCompare("select * from student where Name = 'Peter'") { - df => - assert(df.queryExecution.executedPlan.find(s => s.isInstanceOf[ColumnarToRowExec]).isEmpty) - assert( - df.queryExecution.executedPlan - .find(s => s.isInstanceOf[ArrowFileSourceScanExec]) - .isDefined) + val df = runAndCompare("select * from student where Name = 'Peter'") { + val filePath = rootPath + "/datasource/csv/student.csv" + val df = spark.read + .format("csv") + .option("header", "true") + .load(filePath) + df.createOrReplaceTempView("student") } + assert(df.queryExecution.executedPlan.find(s => s.isInstanceOf[ColumnarToRowExec]).isEmpty) + assert( + df.queryExecution.executedPlan + .find(s => s.isInstanceOf[ArrowFileSourceScanExec]) + .isDefined) } test("insert into select from csv") { @@ -540,21 +769,78 @@ class TestOperator extends VeloxWholeStageTransformerSuite { test("csv scan datasource v2") { withSQLConf("spark.sql.sources.useV1SourceList" -> "") { - val filePath = rootPath + "/datasource/csv/student.csv" - val df = spark.read - .format("csv") - .option("header", "true") - .load(filePath) - df.createOrReplaceTempView("student") - runQueryAndCompare("select * from student") { - checkGlutenOperatorMatch[ArrowBatchScanExec] + val df = runAndCompare("select * from student") { + val filePath = rootPath + "/datasource/csv/student.csv" + val df = spark.read + .format("csv") + .option("header", "true") + .load(filePath) + df.createOrReplaceTempView("student") } - runQueryAndCompare("select * from student where Name = 'Peter'") { - df => - val plan = df.queryExecution.executedPlan - assert(plan.find(s => s.isInstanceOf[ColumnarToRowExec]).isEmpty) - assert(plan.find(s => s.isInstanceOf[ArrowBatchScanExec]).isDefined) + val plan = df.queryExecution.executedPlan + assert(plan.find(s => s.isInstanceOf[ColumnarToRowExec]).isDefined) + assert(plan.find(s => s.isInstanceOf[ArrowBatchScanExec]).isDefined) + } + } + + test("csv scan datasource v2 with filter") { + withSQLConf("spark.sql.sources.useV1SourceList" -> "") { + val df = runAndCompare("select * from student where Name = 'Peter'") { + val filePath = rootPath + "/datasource/csv/student.csv" + val df = spark.read + .format("csv") + .option("header", "true") + .load(filePath) + df.createOrReplaceTempView("student") + } + + val plan = df.queryExecution.executedPlan + assert(plan.find(s => s.isInstanceOf[ColumnarToRowExec]).isEmpty) + assert(plan.find(s => s.isInstanceOf[ArrowBatchScanExec]).isDefined) + } + } + + test("csv scan with schema datasource v2") { + withSQLConf("spark.sql.sources.useV1SourceList" -> "") { + val df = runAndCompare("select * from student") { + val filePath = rootPath + "/datasource/csv/student_option_schema.csv" + val schema = new StructType() + .add("id", StringType) + .add("name", StringType) + .add("language", StringType) + val df = spark.read + .schema(schema) + .format("csv") + .option("header", "true") + .load(filePath) + df.createOrReplaceTempView("student") } + val plan = df.queryExecution.executedPlan + assert(plan.find(s => s.isInstanceOf[ColumnarToRowExec]).isDefined) + assert(plan.find(_.isInstanceOf[ArrowBatchScanExec]).isDefined) + } + } + + test("combine small batches before shuffle") { + val minBatchSize = 15 + withSQLConf( + "spark.gluten.sql.columnar.backend.velox.coalesceBatchesBeforeShuffle" -> "true", + 
"spark.gluten.sql.columnar.maxBatchSize" -> "2", + "spark.gluten.sql.columnar.backend.velox.minBatchSizeForShuffle" -> s"$minBatchSize" + ) { + val df = runQueryAndCompare( + "select l_orderkey, sum(l_partkey) as sum from lineitem " + + "where l_orderkey < 100 group by l_orderkey") { _ => } + checkLengthAndPlan(df, 27) + val ops = collect(df.queryExecution.executedPlan) { case p: VeloxAppendBatchesExec => p } + assert(ops.size == 1) + val op = ops.head + assert(op.minOutputBatchSize == minBatchSize) + val metrics = op.metrics + assert(metrics("numInputRows").value == 27) + assert(metrics("numInputBatches").value == 14) + assert(metrics("numOutputRows").value == 27) + assert(metrics("numOutputBatches").value == 2) } } @@ -801,6 +1087,13 @@ class TestOperator extends VeloxWholeStageTransformerSuite { // No ProjectExecTransformer is introduced. checkSparkOperatorChainMatch[GenerateExecTransformer, FilterExecTransformer] } + + runQueryAndCompare( + s""" + |SELECT $func(${VeloxDummyExpression.VELOX_DUMMY_EXPRESSION}(a)) from t2; + |""".stripMargin) { + checkGlutenOperatorMatch[GenerateExecTransformer] + } } } } @@ -1050,6 +1343,18 @@ class TestOperator extends VeloxWholeStageTransformerSuite { } } + test("Test sample op") { + withSQLConf("spark.gluten.sql.columnarSampleEnabled" -> "true") { + withTable("t") { + sql("create table t (id int, b boolean) using parquet") + sql("insert into t values (1, true), (2, false), (3, null), (4, true), (5, false)") + runQueryAndCompare("select * from t TABLESAMPLE(20 PERCENT)", false) { + checkGlutenOperatorMatch[SampleExecTransformer] + } + } + } + } + test("test cross join") { withTable("t1", "t2") { sql(""" @@ -1089,6 +1394,16 @@ class TestOperator extends VeloxWholeStageTransformerSuite { } } + withSQLConf("spark.gluten.sql.columnar.forceShuffledHashJoin" -> "false") { + runQueryAndCompare( + """ + |select * from t1 left semi join t2 on t1.c1 = t2.c1 and t1.c1 > 50; + |""".stripMargin + ) { + checkGlutenOperatorMatch[SortMergeJoinExecTransformer] + } + } + runQueryAndCompare( """ |select * from t1 cross join t2; @@ -1354,7 +1669,12 @@ class TestOperator extends VeloxWholeStageTransformerSuite { } } - test("test roundrobine with sort") { + test("test RoundRobin repartition with sort") { + def checkRoundRobinOperators(df: DataFrame): Unit = { + checkGlutenOperatorMatch[SortExecTransformer](df) + checkGlutenOperatorMatch[ColumnarShuffleExchangeExec](df) + } + // scalastyle:off runQueryAndCompare("SELECT /*+ REPARTITION(3) */ l_orderkey, l_partkey FROM lineitem") { /* @@ -1364,7 +1684,7 @@ class TestOperator extends VeloxWholeStageTransformerSuite { +- ^(2) ProjectExecTransformer [hash(l_orderkey#16L, l_partkey#17L) AS hash_partition_key#302, l_orderkey#16L, l_partkey#17L] +- ^(2) BatchScanExecTransformer[l_orderkey#16L, l_partkey#17L] ParquetScan DataFilters: [], Format: parquet, Location: InMemoryFileIndex(1 paths)[..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct, PushedFilters: [] RuntimeFilters: [] */ - checkGlutenOperatorMatch[SortExecTransformer] + checkRoundRobinOperators } // scalastyle:on @@ -1377,6 +1697,11 @@ class TestOperator extends VeloxWholeStageTransformerSuite { } } } + + // Gluten-5206: test repartition on map type + runQueryAndCompare( + "SELECT /*+ REPARTITION(3) */ l_orderkey, map(l_orderkey, l_partkey) FROM lineitem")( + checkRoundRobinOperators) } test("Support Map type signature") { @@ -1514,4 +1839,76 @@ class TestOperator extends VeloxWholeStageTransformerSuite { checkGlutenOperatorMatch[GenerateExecTransformer] } } 
+ + test("Fix shuffle with null type failure") { + // single and other partitioning + Seq("1", "2").foreach { + numShufflePartitions => + withSQLConf("spark.sql.shuffle.partitions" -> numShufflePartitions) { + def checkNullTypeRepartition(df: => DataFrame, numProject: Int): Unit = { + var expected: Array[Row] = null + withSQLConf("spark.sql.execution.sortBeforeRepartition" -> "false") { + expected = df.collect() + } + val actual = df + checkAnswer(actual, expected) + assert( + collect(actual.queryExecution.executedPlan) { + case p: ProjectExec => p + }.size == numProject + ) + assert( + collect(actual.queryExecution.executedPlan) { + case shuffle: ColumnarShuffleExchangeExec => shuffle + }.size == 1 + ) + } + + // hash + checkNullTypeRepartition( + spark + .table("lineitem") + .selectExpr("l_orderkey", "null as x") + .repartition($"l_orderkey"), + 0 + ) + // range + checkNullTypeRepartition( + spark + .table("lineitem") + .selectExpr("l_orderkey", "null as x") + .repartitionByRange($"l_orderkey"), + 0 + ) + // round robin + checkNullTypeRepartition( + spark.table("lineitem").selectExpr("l_orderkey", "null as x").repartition(), + 0 + ) + checkNullTypeRepartition( + spark.table("lineitem").selectExpr("null as x", "null as y").repartition(), + 1 + ) + } + } + } + + test("fix non-deterministic filter executed twice when push down to scan") { + val df = sql("select * from lineitem where rand() <= 0.5") + // plan check + val plan = df.queryExecution.executedPlan + val scans = plan.collect { case scan: FileSourceScanExecTransformer => scan } + val filters = plan.collect { case filter: FilterExecTransformer => filter } + assert(scans.size == 1) + assert(filters.size == 1) + assert(scans(0).dataFilters.size == 1) + val remainingFilters = FilterHandler.getRemainingFilters( + scans(0).dataFilters, + splitConjunctivePredicates(filters(0).condition)) + assert(remainingFilters.size == 0) + + // result length check, table lineitem has 60,000 rows + val resultLength = df.collect().length + assert(resultLength > 25000 && resultLength < 35000) + } } diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxAggregateFunctionsSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxAggregateFunctionsSuite.scala index faa361edf5aaa..ae6306cc0d4a1 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxAggregateFunctionsSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxAggregateFunctionsSuite.scala @@ -22,7 +22,9 @@ import org.apache.gluten.extension.columnar.validator.FallbackInjects import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.expressions.aggregate.{Final, Partial} import org.apache.spark.sql.execution.aggregate.BaseAggregateExec +import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ abstract class VeloxAggregateFunctionsSuite extends VeloxWholeStageTransformerSuite { @@ -553,21 +555,15 @@ abstract class VeloxAggregateFunctionsSuite extends VeloxWholeStageTransformerSu } test("approx_count_distinct") { - runQueryAndCompare(""" - |select approx_count_distinct(l_shipmode) from lineitem; - |""".stripMargin) { + runQueryAndCompare( + """ + |select approx_count_distinct(l_shipmode), approx_count_distinct(l_discount) from lineitem; + |""".stripMargin) { checkGlutenOperatorMatch[HashAggregateExecTransformer] } runQueryAndCompare( - "select approx_count_distinct(l_partkey), count(distinct l_orderkey) from lineitem") { - df => - { - 
assert( - getExecutedPlan(df).count( - plan => { - plan.isInstanceOf[HashAggregateExecTransformer] - }) == 0) - } + "select approx_count_distinct(l_discount), count(distinct l_orderkey) from lineitem") { + checkGlutenOperatorMatch[HashAggregateExecTransformer] } } @@ -1118,6 +1114,27 @@ abstract class VeloxAggregateFunctionsSuite extends VeloxWholeStageTransformerSu } } } + + test("complex type with null") { + val jsonStr = """{"txn":{"appId":"txnId","version":0,"lastUpdated":null}}""" + val jsonSchema = StructType( + Seq( + StructField( + "txn", + StructType( + Seq( + StructField("appId", StringType, true), + StructField("lastUpdated", LongType, true), + StructField("version", LongType, true))), + true))) + val df = spark.read.schema(jsonSchema).json(Seq(jsonStr).toDS) + df.select(collect_set(col("txn"))).collect + + df.select(min(col("txn"))).collect + + df.select(max(col("txn"))).collect + + } } class VeloxAggregateFunctionsDefaultSuite extends VeloxAggregateFunctionsSuite { diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxMetricsSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxMetricsSuite.scala index ce8450fea4231..468f262592192 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxMetricsSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxMetricsSuite.scala @@ -19,6 +19,7 @@ package org.apache.gluten.execution import org.apache.gluten.GlutenConfig import org.apache.gluten.sql.shims.SparkShimLoader +import org.apache.spark.SparkConf import org.apache.spark.sql.execution.CommandResultExec import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.internal.SQLConf @@ -52,6 +53,11 @@ class VeloxMetricsSuite extends VeloxWholeStageTransformerSuite with AdaptiveSpa super.afterAll() } + override protected def sparkConf: SparkConf = { + super.sparkConf + .set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager") + } + test("test sort merge join metrics") { withSQLConf( GlutenConfig.COLUMNAR_FPRCE_SHUFFLED_HASH_JOIN_ENABLED.key -> "false", @@ -143,6 +149,36 @@ class VeloxMetricsSuite extends VeloxWholeStageTransformerSuite with AdaptiveSpa } } + test("Metrics of window") { + runQueryAndCompare("SELECT c1, c2, sum(c2) over (partition by c1) as s FROM metrics_t1") { + df => + val window = find(df.queryExecution.executedPlan) { + case _: WindowExecTransformer => true + case _ => false + } + assert(window.isDefined) + val metrics = window.get.metrics + assert(metrics("numOutputRows").value == 100) + assert(metrics("outputVectors").value == 2) + } + } + + test("Metrics of noop filter's children") { + withSQLConf("spark.gluten.ras.enabled" -> "true") { + runQueryAndCompare("SELECT c1, c2 FROM metrics_t1 where c1 < 50") { + df => + val scan = find(df.queryExecution.executedPlan) { + case _: FileSourceScanExecTransformer => true + case _ => false + } + assert(scan.isDefined) + val metrics = scan.get.metrics + assert(metrics("rawInputRows").value == 100) + assert(metrics("outputVectors").value == 1) + } + } + } + test("Write metrics") { if (SparkShimLoader.getSparkVersion.startsWith("3.4")) { withSQLConf(("spark.gluten.sql.native.writer.enabled", "true")) { diff --git a/backends-velox/src/test/scala/org/apache/gluten/expression/VeloxUdfSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/expression/VeloxUdfSuite.scala index 4d2f9fae3147c..534a8d9f1c74d 100644 --- 
a/backends-velox/src/test/scala/org/apache/gluten/expression/VeloxUdfSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/expression/VeloxUdfSuite.scala @@ -93,12 +93,13 @@ abstract class VeloxUdfSuite extends GlutenQueryTest with SQLHelper { | myavg(1), | myavg(1L), | myavg(cast(1.0 as float)), - | myavg(cast(1.0 as double)) + | myavg(cast(1.0 as double)), + | mycount_if(true) |""".stripMargin) df.collect() assert( df.collect() - .sameElements(Array(Row(1.0, 1.0, 1.0, 1.0)))) + .sameElements(Array(Row(1.0, 1.0, 1.0, 1.0, 1L)))) } } diff --git a/backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerTestBase.scala b/backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerTestBase.scala index 7d59fbfae721b..1ee79a2ade873 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerTestBase.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerTestBase.scala @@ -35,7 +35,7 @@ abstract class FuzzerTestBase extends VeloxWholeStageTransformerSuite { .set("spark.plugins", "org.apache.gluten.GlutenPlugin") .set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager") .set("spark.memory.offHeap.enabled", "true") - .set("spark.memory.offHeap.size", "512MB") + .set("spark.memory.offHeap.size", "4g") .set("spark.driver.memory", "4g") .set("spark.driver.maxResultSize", "4g") } diff --git a/backends-velox/src/test/scala/org/apache/gluten/fuzzer/ShuffleWriterFuzzerTest.scala b/backends-velox/src/test/scala/org/apache/gluten/fuzzer/ShuffleWriterFuzzerTest.scala index 1d27f26811deb..7d8fc56d9728a 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/fuzzer/ShuffleWriterFuzzerTest.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/fuzzer/ShuffleWriterFuzzerTest.scala @@ -68,7 +68,7 @@ class ShuffleWriterFuzzerTest extends FuzzerTestBase { logWarning( s"==============================> " + s"Started reproduction (seed: ${dataGenerator.getSeed})") - val result = defaultRunner(testShuffle(sql)) + val result = defaultRunner(testShuffle(sql))() assert(result.isInstanceOf[Successful], s"Failed to run 'reproduce' with seed: $seed") } } diff --git a/backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxParquetWriteForHiveSuite.scala b/backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxParquetWriteForHiveSuite.scala index 9597e3110a104..731f5ef4845c1 100644 --- a/backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxParquetWriteForHiveSuite.scala +++ b/backends-velox/src/test/scala/org/apache/spark/sql/execution/VeloxParquetWriteForHiveSuite.scala @@ -139,11 +139,7 @@ class VeloxParquetWriteForHiveSuite extends GlutenQueryTest with SQLTestUtils { withTable("t") { spark.sql("CREATE TABLE t (c int) STORED AS PARQUET") withSQLConf("spark.sql.hive.convertMetastoreParquet" -> "false") { - if (isSparkVersionGE("3.4")) { - checkNativeWrite("INSERT OVERWRITE TABLE t SELECT 1 as c", checkNative = false) - } else { - checkNativeWrite("INSERT OVERWRITE TABLE t SELECT 1 as c", checkNative = true) - } + checkNativeWrite("INSERT OVERWRITE TABLE t SELECT 1 as c", checkNative = true) } checkAnswer(spark.table("t"), Row(1)) } diff --git a/cpp-ch/CMakeLists.txt b/cpp-ch/CMakeLists.txt index 0c41f494d2a1a..82049bdd31bb4 100644 --- a/cpp-ch/CMakeLists.txt +++ b/cpp-ch/CMakeLists.txt @@ -16,7 +16,9 @@ cmake_minimum_required(VERSION 3.20) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/clickhouse.version CH_VERSION) -set(CH_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/ClickHouse CACHE STRING "ClickHouse source dir") 
+set(CH_SOURCE_DIR + ${CMAKE_CURRENT_SOURCE_DIR}/ClickHouse + CACHE STRING "ClickHouse source dir") string(REGEX REPLACE ".*CH_ORG=([^\n]+).*" "\\1" CH_ORG "${CH_VERSION}") string(REGEX REPLACE ".*CH_BRANCH=([^\n]+).*" "\\1" CH_BRANCH "${CH_VERSION}") string(REGEX REPLACE ".*CH_COMMIT=([^\n]+).*" "\\1" CH_COMMIT "${CH_VERSION}") @@ -27,93 +29,93 @@ message("CH_COMMIT=${CH_COMMIT}") project(libch LANGUAGES C CXX ASM) file(GLOB clickhouse_files "${CH_SOURCE_DIR}/*") -if ("${CH_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}/ClickHouse") - if (NOT clickhouse_files) - execute_process(COMMAND git clone -b ${CH_BRANCH} --depth 3 https://github.com/${CH_ORG}/ClickHouse.git ${CH_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND git reset --hard ${CH_COMMIT} WORKING_DIRECTORY ${CH_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND git submodule update --init --force --depth 1 WORKING_DIRECTORY ${CH_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - else() - execute_process(COMMAND git fetch origin ${CH_BRANCH} --depth 3 WORKING_DIRECTORY ${CH_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND git checkout ${CH_BRANCH} WORKING_DIRECTORY ${CH_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND git reset --hard ${CH_COMMIT} WORKING_DIRECTORY ${CH_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND git submodule update --init --recursive --force --depth 1 WORKING_DIRECTORY ${CH_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - endif() +if("${CH_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}/ClickHouse") + if(NOT clickhouse_files) + execute_process( + COMMAND + git clone -b ${CH_BRANCH} --depth 3 + https://github.com/${CH_ORG}/ClickHouse.git ${CH_SOURCE_DIR} + COMMAND_ERROR_IS_FATAL ANY) + execute_process( + COMMAND git reset --hard ${CH_COMMIT} + WORKING_DIRECTORY ${CH_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process( + COMMAND git submodule update --init --force --depth 1 + WORKING_DIRECTORY ${CH_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + else() + execute_process( + COMMAND git fetch origin ${CH_BRANCH} --depth 3 + WORKING_DIRECTORY ${CH_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process( + COMMAND git checkout ${CH_BRANCH} + WORKING_DIRECTORY ${CH_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process( + COMMAND git reset --hard ${CH_COMMIT} + WORKING_DIRECTORY ${CH_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process( + COMMAND git submodule update --init --recursive --force --depth 1 + WORKING_DIRECTORY ${CH_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + endif() else() - if (NOT clickhouse_files) - # Checking out *all* submodules takes > 5 min. Therefore, the smoke build ("FastTest") in CI initializes only the set of - # submodules minimally needed for a build and we cannot assume here that all submodules are populated. - message(ERROR "clickhouse ${CH_SOURCE_DIR} is missing or empty. to fix try run:") - message(STATUS " git clone --recursive --depth 1 https://github.com/Kyligence/ClickHouse.git ${CMAKE_SOURCE_DIR}") - endif() + if(NOT clickhouse_files) + # Checking out *all* submodules takes > 5 min. Therefore, the smoke build + # ("FastTest") in CI initializes only the set of submodules minimally needed + # for a build and we cannot assume here that all submodules are populated. + message(ERROR + "clickhouse ${CH_SOURCE_DIR} is missing or empty. 
to fix try run:") + message( + STATUS + " git clone --recursive --depth 1 https://github.com/Kyligence/ClickHouse.git ${CMAKE_SOURCE_DIR}" + ) + endif() endif() -if (EXISTS "${CH_SOURCE_DIR}/utils/extern-local-engine") - execute_process(COMMAND rm -rf ${CH_SOURCE_DIR}/utils/extern-local-engine) -endif () -execute_process(COMMAND ln -s ${CMAKE_CURRENT_SOURCE_DIR}/local-engine ${CH_SOURCE_DIR}/utils/extern-local-engine COMMAND_ERROR_IS_FATAL ANY) +if(EXISTS "${CH_SOURCE_DIR}/utils/extern-local-engine") + execute_process(COMMAND rm -rf ${CH_SOURCE_DIR}/utils/extern-local-engine) +endif() +execute_process( + COMMAND ln -s ${CMAKE_CURRENT_SOURCE_DIR}/local-engine + ${CH_SOURCE_DIR}/utils/extern-local-engine COMMAND_ERROR_IS_FATAL ANY) -# execute_process(COMMAND find ${CMAKE_CURRENT_SOURCE_DIR}/local-engine -regex '.*\.\(c\|cpp\|h\)' -exec clang-format-15 --verbose -i --style=file -i {} \;) +# execute_process(COMMAND find ${CMAKE_CURRENT_SOURCE_DIR}/local-engine -regex +# '.*\.\(c\|cpp\|h\)' -exec clang-format-15 --verbose -i --style=file -i {} \;) set(CH_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/build") option(ENABLE_CPP_TEST "Build CPP Unit test" OFF) -if (ENABLE_CPP_TEST) -add_custom_command( - USES_TERMINAL - COMMAND - bash -c - \"cmake -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DENABLE_PROTOBUF=ON - -DENABLE_TESTS=ON - -DENABLE_JEMALLOC=ON - -DENABLE_MULTITARGET_CODE=ON - -DENABLE_EXTERN_LOCAL_ENGINE=ON - -DCOMPILER_FLAGS='-fvisibility=hidden -fvisibility-inlines-hidden' - -S ${CH_SOURCE_DIR} -G Ninja -B ${CH_BINARY_DIR} && - cmake --build ${CH_BINARY_DIR} --target ch unit_tests_local_engine\" - OUTPUT _build_ch) +if(ENABLE_CPP_TEST) + add_custom_command( + USES_TERMINAL + COMMAND + bash -c \"cmake -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DENABLE_PROTOBUF=ON + -DENABLE_TESTS=ON -DENABLE_JEMALLOC=ON -DENABLE_MULTITARGET_CODE=ON + -DENABLE_EXTERN_LOCAL_ENGINE=ON -DCOMPILER_FLAGS='-fvisibility=hidden + -fvisibility-inlines-hidden' -S ${CH_SOURCE_DIR} -G Ninja -B + ${CH_BINARY_DIR} && cmake --build ${CH_BINARY_DIR} --target ch + unit_tests_local_engine\" + OUTPUT _build_ch) else() -add_custom_command( - USES_TERMINAL - COMMAND - bash -c - \"cmake -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DENABLE_PROTOBUF=ON - -DENABLE_TESTS=OFF - -DENABLE_JEMALLOC=ON - -DENABLE_MULTITARGET_CODE=ON - -DENABLE_EXTERN_LOCAL_ENGINE=ON - -DENABLE_ODBC=OFF - -DENABLE_CAPNP=OFF - -DENABLE_ROCKSDB=OFF - -DENABLE_GRPC=OFF - -DENABLE_RUST=OFF - -DENABLE_H3=OFF - -DENABLE_AMQPCPP=OFF - -DENABLE_CASSANDRA=OFF - -DENABLE_KAFKA=OFF - -DENABLE_NATS=OFF - -DENABLE_LIBPQXX=OFF - -DENABLE_NURAFT=OFF - -DENABLE_DATASKETCHES=OFF - -DENABLE_SQLITE=OFF - -DENABLE_S2_GEOMETRY=OFF - -DENABLE_ANNOY=OFF - -DENABLE_ULID=OFF - -DENABLE_MYSQL=OFF - -DENABLE_BCRYPT=OFF - -DENABLE_LDAP=OFF - -DENABLE_MSGPACK=OFF - -DUSE_REPLXX=OFF - -DENABLE_CLICKHOUSE_ALL=OFF - -DCOMPILER_FLAGS='-fvisibility=hidden -fvisibility-inlines-hidden' - -S ${CH_SOURCE_DIR} -G Ninja -B ${CH_BINARY_DIR} && - cmake --build ${CH_BINARY_DIR} --target libch\" - OUTPUT _build_ch) + add_custom_command( + USES_TERMINAL + COMMAND + bash -c \"cmake -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DENABLE_PROTOBUF=ON + -DENABLE_TESTS=OFF 
-DENABLE_JEMALLOC=ON -DENABLE_MULTITARGET_CODE=ON + -DENABLE_EXTERN_LOCAL_ENGINE=ON -DENABLE_ODBC=OFF -DENABLE_CAPNP=OFF + -DENABLE_ROCKSDB=OFF -DENABLE_GRPC=OFF -DENABLE_RUST=OFF -DENABLE_H3=OFF + -DENABLE_AMQPCPP=OFF -DENABLE_CASSANDRA=OFF -DENABLE_KAFKA=OFF + -DENABLE_NATS=OFF -DENABLE_LIBPQXX=OFF -DENABLE_NURAFT=OFF + -DENABLE_DATASKETCHES=OFF -DENABLE_SQLITE=OFF -DENABLE_S2_GEOMETRY=OFF + -DENABLE_ANNOY=OFF -DENABLE_ULID=OFF -DENABLE_MYSQL=OFF + -DENABLE_BCRYPT=OFF -DENABLE_LDAP=OFF -DENABLE_MSGPACK=OFF + -DUSE_REPLXX=OFF -DENABLE_CLICKHOUSE_ALL=OFF -DENABLE_GWP_ASAN=OFF + -DCOMPILER_FLAGS='-fvisibility=hidden -fvisibility-inlines-hidden' -S + ${CH_SOURCE_DIR} -G Ninja -B ${CH_BINARY_DIR} && cmake --build + ${CH_BINARY_DIR} --target libch\" + OUTPUT _build_ch) endif() add_custom_target(build_ch ALL DEPENDS _build_ch) diff --git a/cpp-ch/clickhouse.version b/cpp-ch/clickhouse.version index 775667afb07dc..1630f57601877 100644 --- a/cpp-ch/clickhouse.version +++ b/cpp-ch/clickhouse.version @@ -1,3 +1,4 @@ CH_ORG=Kyligence -CH_BRANCH=rebase_ch/20240526 -CH_COMMIT=ff17e067fac \ No newline at end of file +CH_BRANCH=rebase_ch/20240703 +CH_COMMIT=aa71be074ad + diff --git a/cpp-ch/local-engine/AggregateFunctions/AggregateFunctionSparkAvg.cpp b/cpp-ch/local-engine/AggregateFunctions/AggregateFunctionSparkAvg.cpp new file mode 100644 index 0000000000000..5eb3a0b360575 --- /dev/null +++ b/cpp-ch/local-engine/AggregateFunctions/AggregateFunctionSparkAvg.cpp @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace DB +{ +struct Settings; + +namespace ErrorCodes +{ + +} +} + +namespace local_engine +{ +using namespace DB; + + +DataTypePtr getSparkAvgReturnType(const DataTypePtr & arg_type) +{ + const UInt32 precision_value = std::min(getDecimalPrecision(*arg_type) + 4, DecimalUtils::max_precision); + const auto scale_value = std::min(getDecimalScale(*arg_type) + 4, precision_value); + return createDecimal(precision_value, scale_value); +} + +template +requires is_decimal +class AggregateFunctionSparkAvg final : public AggregateFunctionAvg +{ +public: + using Base = AggregateFunctionAvg; + + explicit AggregateFunctionSparkAvg(const DataTypes & argument_types_, UInt32 num_scale_, UInt32 round_scale_) + : Base(argument_types_, createResultType(argument_types_, num_scale_, round_scale_), num_scale_) + , num_scale(num_scale_) + , round_scale(round_scale_) + { + } + + DataTypePtr createResultType(const DataTypes & argument_types_, UInt32 num_scale_, UInt32 round_scale_) + { + const DataTypePtr & data_type = argument_types_[0]; + const UInt32 precision_value = std::min(getDecimalPrecision(*data_type) + 4, DecimalUtils::max_precision); + const auto scale_value = std::min(num_scale_ + 4, precision_value); + return createDecimal(precision_value, scale_value); + } + + void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override + { + const DataTypePtr & result_type = this->getResultType(); + auto result_scale = getDecimalScale(*result_type); + WhichDataType which(result_type); + if (which.isDecimal32()) + { + assert_cast &>(to).getData().push_back( + divideDecimalAndUInt(this->data(place), num_scale, result_scale, round_scale)); + } + else if (which.isDecimal64()) + { + assert_cast &>(to).getData().push_back( + divideDecimalAndUInt(this->data(place), num_scale, result_scale, round_scale)); + } + else if (which.isDecimal128()) + { + assert_cast &>(to).getData().push_back( + divideDecimalAndUInt(this->data(place), num_scale, result_scale, round_scale)); + } + else + { + assert_cast &>(to).getData().push_back( + divideDecimalAndUInt(this->data(place), num_scale, result_scale, round_scale)); + } + } + + String getName() const override { return "sparkAvg"; } + +private: + Int128 NO_SANITIZE_UNDEFINED + divideDecimalAndUInt(AvgFraction, UInt64> avg, UInt32 num_scale, UInt32 result_scale, UInt32 round_scale) const + { + auto value = avg.numerator.value; + if (result_scale > num_scale) + { + auto diff = DecimalUtils::scaleMultiplier>(result_scale - num_scale); + value = value * diff; + } + else if (result_scale < num_scale) + { + auto diff = DecimalUtils::scaleMultiplier>(num_scale - result_scale); + value = value / diff; + } + + auto result = value / avg.denominator; + + if (round_scale > result_scale) + return result; + + auto round_diff = DecimalUtils::scaleMultiplier>(result_scale - round_scale); + return (result + round_diff / 2) / round_diff * round_diff; + } + +private: + UInt32 num_scale; + UInt32 round_scale; +}; + +AggregateFunctionPtr +createAggregateFunctionSparkAvg(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings) +{ + assertNoParameters(name, parameters); + assertUnary(name, argument_types); + + AggregateFunctionPtr res; + const DataTypePtr & data_type = argument_types[0]; + if (!isDecimal(data_type)) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument for aggregate function 
{}", data_type->getName(), name); + + bool allowPrecisionLoss = settings->get(DECIMAL_OPERATIONS_ALLOW_PREC_LOSS).get(); + const UInt32 p1 = DB::getDecimalPrecision(*data_type); + const UInt32 s1 = DB::getDecimalScale(*data_type); + auto [p2, s2] = GlutenDecimalUtils::LONG_DECIMAL; + auto [_, round_scale] = GlutenDecimalUtils::dividePrecisionScale(p1, s1, p2, s2, allowPrecisionLoss); + + res.reset(createWithDecimalType(*data_type, argument_types, getDecimalScale(*data_type), round_scale)); + return res; +} + +void registerAggregateFunctionSparkAvg(AggregateFunctionFactory & factory) +{ + factory.registerFunction("sparkAvg", createAggregateFunctionSparkAvg); +} + +} diff --git a/cpp-ch/local-engine/CMakeLists.txt b/cpp-ch/local-engine/CMakeLists.txt index 8c96c5f98f719..93ee4b8218af1 100644 --- a/cpp-ch/local-engine/CMakeLists.txt +++ b/cpp-ch/local-engine/CMakeLists.txt @@ -13,22 +13,24 @@ # See the License for the specific language governing permissions and # limitations under the License. - -if (ENABLE_MULTITARGET_CODE) - add_definitions(-DENABLE_MULTITARGET_CODE=1) +if(ENABLE_MULTITARGET_CODE) + add_definitions(-DENABLE_MULTITARGET_CODE=1) else() - add_definitions(-DENABLE_MULTITARGET_CODE=0) + add_definitions(-DENABLE_MULTITARGET_CODE=0) endif() -set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -w -ffunction-sections -fdata-sections") -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w -ffunction-sections -fdata-sections") -set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--no-export-dynamic -Wl,--gc-sections") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -w -ffunction-sections -fdata-sections") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w -ffunction-sections -fdata-sections") +set(CMAKE_SHARED_LINKER_FLAGS + "${CMAKE_EXE_LINKER_FLAGS} -Wl,--no-export-dynamic -Wl,--gc-sections") -if (COMPILER_CLANG AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 16) - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}") +if(COMPILER_CLANG AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 16) + set(CMAKE_SHARED_LINKER_FLAGS + "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}") else() - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_PATH}") -endif () + set(CMAKE_SHARED_LINKER_FLAGS + "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_PATH}") +endif() set(THRIFT_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/thrift/lib/cpp/src") @@ -37,7 +39,7 @@ include(FindJava) include(UseJava) include(FindJNI) -#set(JNI_NATIVE_SOURCES local_engine_jni.cpp) +# set(JNI_NATIVE_SOURCES local_engine_jni.cpp) set(LOCALENGINE_SHARED_LIB _gluten_ch) add_subdirectory(proto) @@ -61,116 +63,124 @@ add_headers_and_sources(disks Disks) add_headers_and_sources(disks Disks/ObjectStorages) include_directories( - ${JNI_INCLUDE_DIRS} - ${CMAKE_CURRENT_BINARY_DIR}/proto - ${THRIFT_INCLUDE_DIR} - ${CMAKE_BINARY_DIR}/contrib/thrift-cmake - ${CMAKE_BINARY_DIR}/contrib/llvm-project/llvm/include - ${CMAKE_CURRENT_SOURCE_DIR} - ${ClickHouse_SOURCE_DIR}/src - ${ClickHouse_SOURCE_DIR}/base - ${ClickHouse_SOURCE_DIR}/contrib/orc/c++/include - ${ClickHouse_SOURCE_DIR}/contrib/arrow-cmake/cpp/src/orc/c++/include - ${CMAKE_BINARY_DIR}/contrib/orc/c++/include - ${ClickHouse_SOURCE_DIR}/contrib/azure/sdk/storage/azure-storage-blobs/inc - ${ClickHouse_SOURCE_DIR}/contrib/azure/sdk/core/azure-core/inc - ${ClickHouse_SOURCE_DIR}/contrib/azure/sdk/storage/azure-storage-common/inc - ${ClickHouse_SOURCE_DIR}/contrib/llvm-project/llvm/include - 
${ClickHouse_SOURCE_DIR}/contrib/llvm-project/utils/bazel/llvm-project-overlay/llvm/include + ${JNI_INCLUDE_DIRS} + ${CMAKE_CURRENT_BINARY_DIR}/proto + ${THRIFT_INCLUDE_DIR} + ${CMAKE_BINARY_DIR}/contrib/thrift-cmake + ${CMAKE_BINARY_DIR}/contrib/llvm-project/llvm/include + ${CMAKE_CURRENT_SOURCE_DIR} + ${ClickHouse_SOURCE_DIR}/src + ${ClickHouse_SOURCE_DIR}/base + ${ClickHouse_SOURCE_DIR}/contrib/orc/c++/include + ${ClickHouse_SOURCE_DIR}/contrib/arrow-cmake/cpp/src/orc/c++/include + ${CMAKE_BINARY_DIR}/contrib/orc/c++/include + ${ClickHouse_SOURCE_DIR}/contrib/azure/sdk/storage/azure-storage-blobs/inc + ${ClickHouse_SOURCE_DIR}/contrib/azure/sdk/core/azure-core/inc + ${ClickHouse_SOURCE_DIR}/contrib/azure/sdk/storage/azure-storage-common/inc + ${ClickHouse_SOURCE_DIR}/contrib/llvm-project/llvm/include + ${ClickHouse_SOURCE_DIR}/contrib/llvm-project/utils/bazel/llvm-project-overlay/llvm/include ) add_subdirectory(Storages/Parquet) add_subdirectory(Storages/SubstraitSource) add_subdirectory(Functions) -add_library(gluten_clickhouse_backend_libs - ${builder_sources} - ${join_sources} - ${parser_sources} - ${rewriter_sources} - ${storages_sources} - ${common_sources} - ${external_sources} - ${shuffle_sources} - ${operator_sources} - ${aggregate_functions_sources} - ${jni_sources} - ${disks_sources} -) - -target_link_libraries(gluten_clickhouse_backend_libs PUBLIC - substrait_source - clickhouse_aggregate_functions - clickhouse_functions - gluten_spark_functions - ch_contrib::xxHash -) +add_library( + gluten_clickhouse_backend_libs + ${builder_sources} + ${join_sources} + ${parser_sources} + ${rewriter_sources} + ${storages_sources} + ${common_sources} + ${external_sources} + ${shuffle_sources} + ${operator_sources} + ${aggregate_functions_sources} + ${jni_sources} + ${disks_sources}) + +target_link_libraries( + gluten_clickhouse_backend_libs + PUBLIC substrait_source clickhouse_aggregate_functions clickhouse_functions + gluten_spark_functions ch_contrib::xxHash) # Add udf sources files in sub directories to functions_sources -option(ENABLE_LOCAL_UDFS "Build UDFs in 'local-engine/Parser/*_udf' subdirectories" ON) -if (ENABLE_LOCAL_UDFS) - file(GLOB children CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} Parser/*_udf) - foreach(child ${children}) - add_headers_and_sources(local_udfs ${child}) - endforeach() -endif () - -file(GLOB children CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} Parser/*_function_parser) +option(ENABLE_LOCAL_UDFS + "Build UDFs in 'local-engine/Parser/*_udf' subdirectories" ON) +if(ENABLE_LOCAL_UDFS) + file( + GLOB children CONFIGURE_DEPENDS + RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + Parser/*_udf) + foreach(child ${children}) + add_headers_and_sources(local_udfs ${child}) + endforeach() +endif() + +file( + GLOB children CONFIGURE_DEPENDS + RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + Parser/*_function_parser) foreach(child ${children}) - add_headers_and_sources(function_parsers ${child}) + add_headers_and_sources(function_parsers ${child}) endforeach() -# Notice: soures files under Parser/*_udf subdirectories must be built into target ${LOCALENGINE_SHARED_LIB} directly -# to make sure all function parsers are registered successly. -add_library(${LOCALENGINE_SHARED_LIB} SHARED - local_engine_jni.cpp - ${local_udfs_sources} - ${function_parsers_sources} - $) # why add clickhouse_malloc? 
check clickhouse PR-8046 +# Notice: soures files under Parser/*_udf subdirectories must be built into +# target ${LOCALENGINE_SHARED_LIB} directly to make sure all function parsers +# are registered successly. +add_library( + ${LOCALENGINE_SHARED_LIB} SHARED + local_engine_jni.cpp ${local_udfs_sources} ${function_parsers_sources} + $) # why add clickhouse_malloc? check + # clickhouse PR-8046 target_compile_options(${LOCALENGINE_SHARED_LIB} PUBLIC -fPIC - -Wno-shorten-64-to-32) - -target_link_libraries(${LOCALENGINE_SHARED_LIB} -PUBLIC - clickhouse_new_delete - clickhouse_common_config - clickhouse_common_io - clickhouse_parsers - clickhouse_storages_system - loggers - gluten_clickhouse_backend_libs - ch_contrib::protobuf -PRIVATE - substrait -) + -Wno-shorten-64-to-32) + +target_link_libraries( + ${LOCALENGINE_SHARED_LIB} + PUBLIC clickhouse_new_delete + clickhouse_common_config + clickhouse_common_io + clickhouse_parsers + clickhouse_storages_system + loggers + gluten_clickhouse_backend_libs + ch_contrib::protobuf + PRIVATE substrait) target_link_libraries(${LOCALENGINE_SHARED_LIB} PUBLIC ch_parquet) -if (ENABLE_JEMALLOC) - target_link_options(${LOCALENGINE_SHARED_LIB} PRIVATE - -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/libch.map -Wl,-Bsymbolic-functions) +if(ENABLE_JEMALLOC) + target_link_options( + ${LOCALENGINE_SHARED_LIB} PRIVATE + -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/libch.map + -Wl,-Bsymbolic-functions) else() - target_link_options(${LOCALENGINE_SHARED_LIB} PRIVATE - -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/libch-hide-jemalloc.map) + target_link_options( + ${LOCALENGINE_SHARED_LIB} PRIVATE + -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/libch-hide-jemalloc.map) endif() -if ("${CMAKE_BUILD_TYPE}" MATCHES "Debug") - set(LOCALENGINE_SHARED_LIB_NAME "libchd.so") -else () - set(LOCALENGINE_SHARED_LIB_NAME "libch.so") -endif () +if("${CMAKE_BUILD_TYPE}" MATCHES "Debug") + set(LOCALENGINE_SHARED_LIB_NAME "libchd.so") +else() + set(LOCALENGINE_SHARED_LIB_NAME "libch.so") +endif() add_custom_command( - OUTPUT ${LOCALENGINE_SHARED_LIB_NAME} - COMMAND ${CMAKE_COMMAND} -E rename $ ${LOCALENGINE_SHARED_LIB_NAME} - COMMENT "Renaming $ to ${LOCALENGINE_SHARED_LIB_NAME}" - DEPENDS ${LOCALENGINE_SHARED_LIB}) + OUTPUT ${LOCALENGINE_SHARED_LIB_NAME} + COMMAND ${CMAKE_COMMAND} -E rename $ + ${LOCALENGINE_SHARED_LIB_NAME} + COMMENT + "Renaming $ to ${LOCALENGINE_SHARED_LIB_NAME}" + DEPENDS ${LOCALENGINE_SHARED_LIB}) add_custom_target(libch ALL DEPENDS ${LOCALENGINE_SHARED_LIB_NAME}) add_subdirectory(tests) -if (ENABLE_EXAMPLES) - add_subdirectory(examples) -endif() \ No newline at end of file +if(ENABLE_EXAMPLES) + add_subdirectory(examples) +endif() diff --git a/cpp-ch/local-engine/Common/BlockIterator.cpp b/cpp-ch/local-engine/Common/BlockIterator.cpp index 1a76f646b20a4..4647018932078 100644 --- a/cpp-ch/local-engine/Common/BlockIterator.cpp +++ b/cpp-ch/local-engine/Common/BlockIterator.cpp @@ -34,24 +34,30 @@ void local_engine::BlockIterator::checkNextValid() throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Block iterator next should after hasNext"); } } + void BlockIterator::produce() { consumed = false; } + void BlockIterator::consume() { consumed = true; } + bool BlockIterator::isConsumed() const { return consumed; } + DB::Block & BlockIterator::currentBlock() { return cached_block; } + void BlockIterator::setCurrentBlock(DB::Block & block) { cached_block = block; } + } diff --git a/cpp-ch/local-engine/Common/CHUtil.cpp b/cpp-ch/local-engine/Common/CHUtil.cpp 
index 9e2ce63047186..770fbbc59c809 100644 --- a/cpp-ch/local-engine/Common/CHUtil.cpp +++ b/cpp-ch/local-engine/Common/CHUtil.cpp @@ -17,6 +17,7 @@ #include "CHUtil.h" #include +#include #include #include #include @@ -77,6 +78,7 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int UNKNOWN_TYPE; +extern const int CANNOT_PARSE_PROTOBUF_SCHEMA; } } @@ -466,17 +468,17 @@ String QueryPipelineUtil::explainPipeline(DB::QueryPipeline & pipeline) using namespace DB; -std::map BackendInitializerUtil::getBackendConfMap(std::string * plan) +std::map BackendInitializerUtil::getBackendConfMap(const std::string_view plan) { std::map ch_backend_conf; - if (plan == nullptr) + if (plan.empty()) return ch_backend_conf; /// Parse backend configs from plan extensions do { auto plan_ptr = std::make_unique(); - auto success = plan_ptr->ParseFromString(*plan); + auto success = plan_ptr->ParseFromString(plan); if (!success) break; @@ -526,6 +528,50 @@ std::map BackendInitializerUtil::getBackendConfMap(std return ch_backend_conf; } +std::vector BackendInitializerUtil::wrapDiskPathConfig( + const String & path_prefix, + const String & path_suffix, + Poco::Util::AbstractConfiguration & config) +{ + std::vector changed_paths; + if (path_prefix.empty() && path_suffix.empty()) + return changed_paths; + Poco::Util::AbstractConfiguration::Keys disks; + std::unordered_set disk_types = {"s3", "hdfs_gluten", "cache"}; + config.keys("storage_configuration.disks", disks); + + std::ranges::for_each( + disks, + [&](const auto & disk_name) + { + String disk_prefix = "storage_configuration.disks." + disk_name; + String disk_type = config.getString(disk_prefix + ".type", ""); + if (!disk_types.contains(disk_type)) + return; + if (disk_type == "cache") + { + String path = config.getString(disk_prefix + ".path", ""); + if (!path.empty()) + { + String final_path = path_prefix + path + path_suffix; + config.setString(disk_prefix + ".path", final_path); + changed_paths.emplace_back(final_path); + } + } + else if (disk_type == "s3" || disk_type == "hdfs_gluten") + { + String metadata_path = config.getString(disk_prefix + ".metadata_path", ""); + if (!metadata_path.empty()) + { + String final_path = path_prefix + metadata_path + path_suffix; + config.setString(disk_prefix + ".metadata_path", final_path); + changed_paths.emplace_back(final_path); + } + } + }); + return changed_paths; +} + DB::Context::ConfigurationPtr BackendInitializerUtil::initConfig(std::map & backend_conf_map) { DB::Context::ConfigurationPtr config; @@ -552,9 +598,38 @@ DB::Context::ConfigurationPtr BackendInitializerUtil::initConfig(std::mapsetString(key.substr(CH_RUNTIME_CONFIG_PREFIX.size()), value); + const auto name = key.substr(CH_RUNTIME_CONFIG_PREFIX.size()); + if ((name == "storage_configuration.disks.s3.metadata_path" || name == "path") && !value.ends_with("/")) + config->setString(name, value + "/"); + else + config->setString(name, value); } } + + if (backend_conf_map.contains(GLUTEN_TASK_OFFHEAP)) + { + config->setString(CH_TASK_MEMORY, backend_conf_map.at(GLUTEN_TASK_OFFHEAP)); + } + + const bool use_current_directory_as_tmp = config->getBool("use_current_directory_as_tmp", false); + char buffer[PATH_MAX]; + if (use_current_directory_as_tmp && getcwd(buffer, sizeof(buffer)) != nullptr) + { + wrapDiskPathConfig(String(buffer), "", *config); + } + + const bool reuse_disk_cache = config->getBool("reuse_disk_cache", true); + + if (!reuse_disk_cache) + { + String pid = std::to_string(static_cast(getpid())); + auto path_need_clean = 
wrapDiskPathConfig("", "/" + pid, *config); + std::lock_guard lock(BackendFinalizerUtil::paths_mutex); + BackendFinalizerUtil::paths_need_to_clean.insert( + BackendFinalizerUtil::paths_need_to_clean.end(), + path_need_clean.begin(), + path_need_clean.end()); + } return config; } @@ -575,12 +650,13 @@ void BackendInitializerUtil::initEnvs(DB::Context::ConfigurationPtr config) /// Set environment variable TZ if possible if (config->has("timezone")) { - const String timezone_name = config->getString("timezone"); - if (0 != setenv("TZ", timezone_name.data(), 1)) /// NOLINT + const std::string config_timezone = config->getString("timezone"); + const String mapped_timezone = DateTimeUtil::convertTimeZone(config_timezone); + if (0 != setenv("TZ", mapped_timezone.data(), 1)) // NOLINT(concurrency-mt-unsafe) // ok if not called concurrently with other setenv/getenv throw Poco::Exception("Cannot setenv TZ variable"); tzset(); - DateLUT::setDefaultTimezone(timezone_name); + DateLUT::setDefaultTimezone(mapped_timezone); } /// Set environment variable LIBHDFS3_CONF if possible @@ -612,7 +688,9 @@ void BackendInitializerUtil::initSettings(std::map & b { /// Initialize default setting. settings.set("date_time_input_format", "best_effort"); - settings.set("mergetree.merge_after_insert", true); + settings.set(MERGETREE_MERGE_AFTER_INSERT, true); + settings.set(MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE, false); + settings.set(DECIMAL_OPERATIONS_ALLOW_PREC_LOSS, true); for (const auto & [key, value] : backend_conf_map) { @@ -646,8 +724,18 @@ void BackendInitializerUtil::initSettings(std::map & b settings.set(k, toField(k, value)); LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{} value:{}", key, value); } + else if (key == SPARK_SESSION_TIME_ZONE) + { + String time_zone_val = DateTimeUtil::convertTimeZone(value); + settings.set("session_timezone", time_zone_val); + LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{} value:{}", "session_timezone", time_zone_val); + } + else if (key == DECIMAL_OPERATIONS_ALLOW_PREC_LOSS) + { + settings.set(key, toField(key, value)); + LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{} value:{}", key, value); + } } - /// Finally apply some fixed kvs to settings. 
settings.set("join_use_nulls", true); settings.set("input_format_orc_allow_missing_columns", true); @@ -669,9 +757,25 @@ void BackendInitializerUtil::initSettings(std::map & b settings.set("output_format_json_quote_64bit_integers", false); settings.set("output_format_json_quote_denormals", true); settings.set("output_format_json_skip_null_value_in_named_tuples", true); + settings.set("output_format_json_escape_forward_slashes", false); settings.set("function_json_value_return_type_allow_complex", true); settings.set("function_json_value_return_type_allow_nullable", true); settings.set("precise_float_parsing", true); + if (backend_conf_map.contains(GLUTEN_TASK_OFFHEAP)) + { + auto task_memory = std::stoull(backend_conf_map.at(GLUTEN_TASK_OFFHEAP)); + if (!backend_conf_map.contains(CH_RUNTIME_SETTINGS_PREFIX + "max_bytes_before_external_sort")) + { + settings.max_bytes_before_external_sort = static_cast(0.8 * task_memory); + } + if (!backend_conf_map.contains(CH_RUNTIME_SETTINGS_PREFIX + "prefer_external_sort_block_bytes")) + { + auto mem_gb = task_memory / static_cast(1_GiB); + // 2.8x+5, Heuristics calculate the block size of external sort, [8,16] + settings.prefer_external_sort_block_bytes = std::max(std::min( + static_cast(2.8*mem_gb + 5), 16ul), 8ul) * 1024 * 1024; + } + } } void BackendInitializerUtil::initContexts(DB::Context::ConfigurationPtr config) @@ -701,6 +805,11 @@ void BackendInitializerUtil::initContexts(DB::Context::ConfigurationPtr config) global_context->setTemporaryStoragePath(config->getString("tmp_path", getDefaultPath()), 0); global_context->setPath(config->getString("path", "/")); + String uncompressed_cache_policy = config->getString("uncompressed_cache_policy", DEFAULT_UNCOMPRESSED_CACHE_POLICY); + size_t uncompressed_cache_size = config->getUInt64("uncompressed_cache_size", DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE); + double uncompressed_cache_size_ratio = config->getDouble("uncompressed_cache_size_ratio", DEFAULT_UNCOMPRESSED_CACHE_SIZE_RATIO); + global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size, uncompressed_cache_size_ratio); + String mark_cache_policy = config->getString("mark_cache_policy", DEFAULT_MARK_CACHE_POLICY); size_t mark_cache_size = config->getUInt64("mark_cache_size", DEFAULT_MARK_CACHE_MAX_SIZE); double mark_cache_size_ratio = config->getDouble("mark_cache_size_ratio", DEFAULT_MARK_CACHE_SIZE_RATIO); @@ -709,10 +818,29 @@ void BackendInitializerUtil::initContexts(DB::Context::ConfigurationPtr config) global_context->setMarkCache(mark_cache_policy, mark_cache_size, mark_cache_size_ratio); + String index_uncompressed_cache_policy = config->getString("index_uncompressed_cache_policy", DEFAULT_INDEX_UNCOMPRESSED_CACHE_POLICY); + size_t index_uncompressed_cache_size = config->getUInt64("index_uncompressed_cache_size", DEFAULT_INDEX_UNCOMPRESSED_CACHE_MAX_SIZE); + double index_uncompressed_cache_size_ratio = config->getDouble("index_uncompressed_cache_size_ratio", DEFAULT_INDEX_UNCOMPRESSED_CACHE_SIZE_RATIO); + global_context->setIndexUncompressedCache(index_uncompressed_cache_policy, index_uncompressed_cache_size, index_uncompressed_cache_size_ratio); + String index_mark_cache_policy = config->getString("index_mark_cache_policy", DEFAULT_INDEX_MARK_CACHE_POLICY); size_t index_mark_cache_size = config->getUInt64("index_mark_cache_size", DEFAULT_INDEX_MARK_CACHE_MAX_SIZE); double index_mark_cache_size_ratio = config->getDouble("index_mark_cache_size_ratio", DEFAULT_INDEX_MARK_CACHE_SIZE_RATIO); 
global_context->setIndexMarkCache(index_mark_cache_policy, index_mark_cache_size, index_mark_cache_size_ratio); + + size_t mmap_cache_size = config->getUInt64("mmap_cache_size", DEFAULT_MMAP_CACHE_MAX_SIZE); + global_context->setMMappedFileCache(mmap_cache_size); + + /// Initialize a dummy query cache. + global_context->setQueryCache(0, 0, 0, 0); + + // We must set the application type to CLIENT to avoid ServerUUID::get() throw exception + global_context->setApplicationType(Context::ApplicationType::CLIENT); + } + else + { + // just for ut + global_context->updateStorageConfiguration(*config); } } @@ -730,6 +858,7 @@ void BackendInitializerUtil::updateNewSettings(const DB::ContextMutablePtr & con extern void registerAggregateFunctionCombinatorPartialMerge(AggregateFunctionCombinatorFactory &); extern void registerAggregateFunctionsBloomFilter(AggregateFunctionFactory &); +extern void registerAggregateFunctionSparkAvg(AggregateFunctionFactory &); extern void registerFunctions(FunctionFactory &); void registerAllFunctions() @@ -739,7 +868,7 @@ void registerAllFunctions() DB::registerAggregateFunctions(); auto & agg_factory = AggregateFunctionFactory::instance(); registerAggregateFunctionsBloomFilter(agg_factory); - + registerAggregateFunctionSparkAvg(agg_factory); { /// register aggregate function combinators from local_engine auto & factory = AggregateFunctionCombinatorFactory::instance(); @@ -750,10 +879,7 @@ void registerAllFunctions() void registerGlutenDisks() { registerDisks(true); - -#if USE_AWS_S3 registerGlutenDisks(true); -#endif } void BackendInitializerUtil::registerAllFactories() @@ -787,14 +913,8 @@ void BackendInitializerUtil::initCompiledExpressionCache(DB::Context::Configurat #endif } -void BackendInitializerUtil::init_json(std::string * plan_json) -{ - auto plan_ptr = std::make_unique(); - google::protobuf::util::JsonStringToMessage(plan_json->c_str(), plan_ptr.get()); - return init(new String(plan_ptr->SerializeAsString())); -} -void BackendInitializerUtil::init(std::string * plan) +void BackendInitializerUtil::init(const std::string & plan) { std::map backend_conf_map = getBackendConfMap(plan); DB::Context::ConfigurationPtr config = initConfig(backend_conf_map); @@ -840,10 +960,19 @@ void BackendInitializerUtil::init(std::string * plan) active_parts_loading_threads, 0, // We don't need any threads one all the parts will be loaded active_parts_loading_threads); + + const size_t cleanup_threads = config->getUInt("max_parts_cleaning_thread_pool_size", 128); + getPartsCleaningThreadPool().initialize( + cleanup_threads, + 0, // We don't need any threads one all the parts will be deleted + cleanup_threads); + + // Avoid using LD_PRELOAD in child process + unsetenv("LD_PRELOAD"); }); } -void BackendInitializerUtil::updateConfig(const DB::ContextMutablePtr & context, std::string * plan) +void BackendInitializerUtil::updateConfig(const DB::ContextMutablePtr & context, const std::string_view plan) { std::map backend_conf_map = getBackendConfMap(plan); @@ -868,17 +997,38 @@ void BackendFinalizerUtil::finalizeGlobally() global_context.reset(); shared_context.reset(); } + std::lock_guard lock(paths_mutex); + std::ranges::for_each(paths_need_to_clean, [](const auto & path) + { + if (fs::exists(path)) + fs::remove_all(path); + }); + paths_need_to_clean.clear(); } void BackendFinalizerUtil::finalizeSessionally() { } +std::vector BackendFinalizerUtil::paths_need_to_clean; + +std::mutex BackendFinalizerUtil::paths_mutex; + Int64 DateTimeUtil::currentTimeMillis() { return 
timeInMilliseconds(std::chrono::system_clock::now()); } +String DateTimeUtil::convertTimeZone(const String & time_zone) +{ + String res = time_zone; + /// Convert timezone ID like '+08:00' to GMT+8:00 + if (time_zone.starts_with("+") || time_zone.starts_with("-")) + res = "GMT" + time_zone; + res = DateLUT::mappingForJavaTimezone(res); + return res; +} + UInt64 MemoryUtil::getCurrentMemoryUsage(size_t depth) { Int64 current_memory_usage = 0; diff --git a/cpp-ch/local-engine/Common/CHUtil.h b/cpp-ch/local-engine/Common/CHUtil.h index edbd91c50d227..3ac0f63ce10b4 100644 --- a/cpp-ch/local-engine/Common/CHUtil.h +++ b/cpp-ch/local-engine/Common/CHUtil.h @@ -1,3 +1,4 @@ +/* /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -34,7 +35,12 @@ class QueryPlan; namespace local_engine { -static const std::unordered_set BOOL_VALUE_SETTINGS{"mergetree.merge_after_insert"}; +static const String MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE = "mergetree.insert_without_local_storage"; +static const String MERGETREE_MERGE_AFTER_INSERT = "mergetree.merge_after_insert"; +static const std::string DECIMAL_OPERATIONS_ALLOW_PREC_LOSS = "spark.sql.decimalOperations.allowPrecisionLoss"; + +static const std::unordered_set BOOL_VALUE_SETTINGS{ + MERGETREE_MERGE_AFTER_INSERT, MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE, DECIMAL_OPERATIONS_ALLOW_PREC_LOSS}; static const std::unordered_set LONG_VALUE_SETTINGS{ "optimize.maxfilesize", "optimize.minFileSize", "mergetree.max_num_part_per_merge_task"}; @@ -134,9 +140,8 @@ class BackendInitializerUtil /// Initialize two kinds of resources /// 1. global level resources like global_context/shared_context, notice that they can only be initialized once in process lifetime /// 2. session level resources like settings/configs, they can be initialized multiple times following the lifetime of executor/driver - static void init(std::string * plan); - static void init_json(std::string * plan_json); - static void updateConfig(const DB::ContextMutablePtr &, std::string *); + static void init(const std::string & plan); + static void updateConfig(const DB::ContextMutablePtr &, const std::string_view); // use excel text parser @@ -167,6 +172,10 @@ class BackendInitializerUtil inline static const std::string SPARK_HADOOP_PREFIX = "spark.hadoop."; inline static const std::string S3A_PREFIX = "fs.s3a."; inline static const std::string SPARK_DELTA_PREFIX = "spark.databricks.delta."; + inline static const std::string SPARK_SESSION_TIME_ZONE = "spark.sql.session.timeZone"; + + inline static const String GLUTEN_TASK_OFFHEAP = "spark.gluten.memory.task.offHeap.size.in.bytes"; + inline static const String CH_TASK_MEMORY = "off_heap_per_task"; /// On yarn mode, native writing on hdfs cluster takes yarn container user as the user passed to libhdfs3, which /// will cause permission issue because yarn container user is not the owner of the hdfs dir to be written. 
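Note for readers of the two hunks above: the new GLUTEN_TASK_OFFHEAP key ("spark.gluten.memory.task.offHeap.size.in.bytes") is what drives the external-sort defaults added to BackendInitializerUtil::initSettings earlier in this diff, and those defaults apply only when the corresponding ClickHouse settings are not set explicitly: max_bytes_before_external_sort becomes 80% of the task off-heap budget, and prefer_external_sort_block_bytes becomes 2.8 * mem_gb + 5 MiB clamped to [8, 16] MiB. A minimal standalone sketch of that arithmetic, using hypothetical names that are not part of the patch:

// Illustrative sketch only: mirrors the defaults derived from the task off-heap size in
// BackendInitializerUtil::initSettings. SortDefaults / deriveSortDefaults are hypothetical names.
#include <algorithm>
#include <cstdint>

struct SortDefaults
{
    uint64_t max_bytes_before_external_sort;    // 80% of the task off-heap budget
    uint64_t prefer_external_sort_block_bytes;  // 2.8 * mem_gb + 5, clamped to [8, 16] MiB
};

inline SortDefaults deriveSortDefaults(uint64_t task_offheap_bytes)
{
    constexpr uint64_t GiB = 1ULL << 30;
    constexpr uint64_t MiB = 1ULL << 20;
    const double mem_gb = static_cast<double>(task_offheap_bytes) / GiB;
    const uint64_t block_mib = std::clamp<uint64_t>(static_cast<uint64_t>(2.8 * mem_gb + 5), 8, 16);
    return {static_cast<uint64_t>(0.8 * task_offheap_bytes), block_mib * MiB};
}

// Example: with an 8 GiB task off-heap budget, sorts spill after ~6.4 GiB and the preferred
// external sort block is 16 MiB (2.8 * 8 + 5 = 27.4, clamped down to 16).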
@@ -187,9 +196,10 @@ class BackendInitializerUtil static void registerAllFactories(); static void applyGlobalConfigAndSettings(DB::Context::ConfigurationPtr, DB::Settings &); static void updateNewSettings(const DB::ContextMutablePtr &, const DB::Settings &); + static std::vector wrapDiskPathConfig(const String & path_prefix, const String & path_suffix, Poco::Util::AbstractConfiguration & config); - static std::map getBackendConfMap(std::string * plan); + static std::map getBackendConfMap(const std::string_view plan); inline static std::once_flag init_flag; inline static Poco::Logger * logger; @@ -203,6 +213,9 @@ class BackendFinalizerUtil /// Release session level resources like StorageJoinBuilder. Invoked every time executor/driver shutdown. static void finalizeSessionally(); + + static std::vector paths_need_to_clean; + static std::mutex paths_mutex; }; // Ignore memory track, memory should free before IgnoreMemoryTracker deconstruction @@ -220,6 +233,7 @@ class DateTimeUtil { public: static Int64 currentTimeMillis(); + static String convertTimeZone(const String & time_zone); }; class MemoryUtil @@ -275,10 +289,7 @@ class ConcurrentDeque return deq.empty(); } - std::deque unsafeGet() - { - return deq; - } + std::deque unsafeGet() { return deq; } private: std::deque deq; diff --git a/cpp-ch/local-engine/Common/GlutenDecimalUtils.h b/cpp-ch/local-engine/Common/GlutenDecimalUtils.h new file mode 100644 index 0000000000000..32af66ec590e0 --- /dev/null +++ b/cpp-ch/local-engine/Common/GlutenDecimalUtils.h @@ -0,0 +1,108 @@ +/* +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + + +namespace local_engine +{ + +class GlutenDecimalUtils +{ +public: + static constexpr size_t MAX_PRECISION = 38; + static constexpr size_t MAX_SCALE = 38; + static constexpr auto system_Default = std::tuple(MAX_PRECISION, 18); + static constexpr auto user_Default = std::tuple(10, 0); + static constexpr size_t MINIMUM_ADJUSTED_SCALE = 6; + + // The decimal types compatible with other numeric types + static constexpr auto BOOLEAN_DECIMAL = std::tuple(1, 0); + static constexpr auto BYTE_DECIMAL = std::tuple(3, 0); + static constexpr auto SHORT_DECIMAL = std::tuple(5, 0); + static constexpr auto INT_DECIMAL = std::tuple(10, 0); + static constexpr auto LONG_DECIMAL = std::tuple(20, 0); + static constexpr auto FLOAT_DECIMAL = std::tuple(14, 7); + static constexpr auto DOUBLE_DECIMAL = std::tuple(30, 15); + static constexpr auto BIGINT_DECIMAL = std::tuple(MAX_PRECISION, 0); + + static std::tuple adjustPrecisionScale(size_t precision, size_t scale) + { + if (precision <= MAX_PRECISION) + { + // Adjustment only needed when we exceed max precision + return std::tuple(precision, scale); + } + else if (scale < 0) + { + // Decimal can have negative scale (SPARK-24468). In this case, we cannot allow a precision + // loss since we would cause a loss of digits in the integer part. + // In this case, we are likely to meet an overflow. + return std::tuple(GlutenDecimalUtils::MAX_PRECISION, scale); + } + else + { + // Precision/scale exceed maximum precision. Result must be adjusted to MAX_PRECISION. + auto intDigits = precision - scale; + // If original scale is less than MINIMUM_ADJUSTED_SCALE, use original scale value; otherwise + // preserve at least MINIMUM_ADJUSTED_SCALE fractional digits + auto minScaleValue = std::min(scale, GlutenDecimalUtils::MINIMUM_ADJUSTED_SCALE); + // The resulting scale is the maximum between what is available without causing a loss of + // digits for the integer part of the decimal and the minimum guaranteed scale, which is + // computed above + auto adjustedScale = std::max(GlutenDecimalUtils::MAX_PRECISION - intDigits, minScaleValue); + + return std::tuple(GlutenDecimalUtils::MAX_PRECISION, adjustedScale); + } + } + + static std::tuple dividePrecisionScale(size_t p1, size_t s1, size_t p2, size_t s2, bool allowPrecisionLoss) + { + if (allowPrecisionLoss) + { + // Precision: p1 - s1 + s2 + max(6, s1 + p2 + 1) + // Scale: max(6, s1 + p2 + 1) + const size_t intDig = p1 - s1 + s2; + const size_t scale = std::max(MINIMUM_ADJUSTED_SCALE, s1 + p2 + 1); + const size_t precision = intDig + scale; + return adjustPrecisionScale(precision, scale); + } + else + { + auto intDig = std::min(MAX_SCALE, p1 - s1 + s2); + auto decDig = std::min(MAX_SCALE, std::max(static_cast(6), s1 + p2 + 1)); + auto diff = (intDig + decDig) - MAX_SCALE; + if (diff > 0) + { + decDig -= diff / 2 + 1; + intDig = MAX_SCALE - decDig; + } + return std::tuple(intDig + decDig, decDig); + } + } + + static std::tuple widerDecimalType(const size_t p1, const size_t s1, const size_t p2, const size_t s2) + { + // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2) + auto scale = std::max(s1, s2); + auto range = std::max(p1 - s1, p2 - s2); + return std::tuple(range + scale, scale); + } + +}; + +} diff --git a/cpp-ch/local-engine/Common/MergeTreeTool.cpp b/cpp-ch/local-engine/Common/MergeTreeTool.cpp index f430c9306912b..63bf64726bf24 100644 --- a/cpp-ch/local-engine/Common/MergeTreeTool.cpp +++ b/cpp-ch/local-engine/Common/MergeTreeTool.cpp @@ -16,14 +16,14 @@ */ #include "MergeTreeTool.h" +#include 
+#include + #include #include #include #include #include -#include -#include -#include #include using namespace DB; diff --git a/cpp-ch/local-engine/Common/QueryContext.cpp b/cpp-ch/local-engine/Common/QueryContext.cpp index c659e6f34ea16..f4d39c612430e 100644 --- a/cpp-ch/local-engine/Common/QueryContext.cpp +++ b/cpp-ch/local-engine/Common/QueryContext.cpp @@ -67,7 +67,7 @@ int64_t initializeQuery(ReservationListenerWrapperPtr listener) else listener->reserve(size); }; - CurrentMemoryTracker::before_free = [listener](Int64 size) -> void { listener->free(size); }; + CurrentMemoryTracker::before_free = [listener](Int64 size) -> void { listener->tryFree(size); }; CurrentMemoryTracker::current_memory = [listener]() -> Int64 { return listener->currentMemory(); }; allocator_map.insert(allocator_id, allocator_context); return allocator_id; diff --git a/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.cpp b/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.cpp index bff4108f28a16..f207ad232b4f2 100644 --- a/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.cpp +++ b/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.cpp @@ -17,6 +17,8 @@ #include "GlutenDiskHDFS.h" #include + +#include #include #if USE_HDFS @@ -30,16 +32,11 @@ void GlutenDiskHDFS::createDirectory(const String & path) hdfsCreateDirectory(hdfs_object_storage->getHDFSFS(), path.c_str()); } -String GlutenDiskHDFS::path2AbsPath(const String & path) -{ - return getObjectStorage()->generateObjectKeyForPath(path).serialize(); -} - void GlutenDiskHDFS::createDirectories(const String & path) { DiskObjectStorage::createDirectories(path); - auto* hdfs = hdfs_object_storage->getHDFSFS(); - fs::path p = path; + auto * hdfs = hdfs_object_storage->getHDFSFS(); + fs::path p = "/" + path; std::vector paths_created; while (hdfsExists(hdfs, p.c_str()) < 0) { @@ -55,7 +52,15 @@ void GlutenDiskHDFS::createDirectories(const String & path) void GlutenDiskHDFS::removeDirectory(const String & path) { DiskObjectStorage::removeDirectory(path); - hdfsDelete(hdfs_object_storage->getHDFSFS(), path.c_str(), 1); + String abs_path = "/" + path; + hdfsDelete(hdfs_object_storage->getHDFSFS(), abs_path.c_str(), 1); +} + +void GlutenDiskHDFS::removeRecursive(const String & path) +{ + DiskObjectStorage::removeRecursive(path); + String abs_path = "/" + path; + hdfsDelete(hdfs_object_storage->getHDFSFS(), abs_path.c_str(), 1); } DiskObjectStoragePtr GlutenDiskHDFS::createDiskObjectStorage() @@ -67,9 +72,19 @@ DiskObjectStoragePtr GlutenDiskHDFS::createDiskObjectStorage() getMetadataStorage(), getObjectStorage(), SerializedPlanParser::global_context->getConfigRef(), - config_prefix); + config_prefix, + object_storage_creator); } - +std::unique_ptr GlutenDiskHDFS::writeFile( + const String & path, + size_t buf_size, + DB::WriteMode mode, + const DB::WriteSettings & settings) +{ + if (throttler) + throttler->add(1); + return DiskObjectStorage::writeFile(path, buf_size, mode, settings); +} } #endif \ No newline at end of file diff --git a/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.h b/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.h index 9caedaae87858..97a99f1deaba1 100644 --- a/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.h +++ b/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.h @@ -19,6 +19,7 @@ #include +#include #include #if USE_HDFS #include @@ -36,13 +37,18 @@ class GlutenDiskHDFS : public DB::DiskObjectStorage DB::MetadataStoragePtr metadata_storage_, DB::ObjectStoragePtr object_storage_, const 
Poco::Util::AbstractConfiguration & config, - const String & config_prefix) + const String & config_prefix, + std::function _object_storage_creator) : DiskObjectStorage(name_, object_key_prefix_, metadata_storage_, object_storage_, config, config_prefix) + , object_key_prefix(object_key_prefix_) + , hdfs_config_prefix(config_prefix) + , object_storage_creator(_object_storage_creator) { - chassert(dynamic_cast(object_storage_.get()) != nullptr); - object_key_prefix = object_key_prefix_; - hdfs_object_storage = dynamic_cast(object_storage_.get()); + hdfs_object_storage = typeid_cast>(object_storage_); hdfsSetWorkingDirectory(hdfs_object_storage->getHDFSFS(), "/"); + auto max_speed = config.getUInt(config_prefix + ".write_speed", 450); + throttler = std::make_shared(max_speed); } void createDirectory(const String & path) override; @@ -51,12 +57,31 @@ class GlutenDiskHDFS : public DB::DiskObjectStorage void removeDirectory(const String & path) override; + void removeRecursive(const String & path) override; + DB::DiskObjectStoragePtr createDiskObjectStorage() override; -private: - String path2AbsPath(const String & path); - GlutenHDFSObjectStorage * hdfs_object_storage; + std::unique_ptr writeFile(const String& path, size_t buf_size, DB::WriteMode mode, + const DB::WriteSettings& settings) override; + + void applyNewSettings( + const Poco::Util::AbstractConfiguration & config, + DB::ContextPtr context, + const String & config_prefix, + const DB::DisksMap & map) override + { + DB::ObjectStoragePtr tmp = object_storage_creator(config, context); + hdfs_object_storage = typeid_cast>(tmp); + object_storage = hdfs_object_storage; + } +private: + std::shared_ptr hdfs_object_storage; String object_key_prefix; + DB::ThrottlerPtr throttler; + const String hdfs_config_prefix; + std::function + object_storage_creator; }; #endif } diff --git a/cpp-ch/local-engine/Disks/ObjectStorages/GlutenHDFSObjectStorage.cpp b/cpp-ch/local-engine/Disks/ObjectStorages/GlutenHDFSObjectStorage.cpp index 60b82ec845bb4..cab87d66d8848 100644 --- a/cpp-ch/local-engine/Disks/ObjectStorages/GlutenHDFSObjectStorage.cpp +++ b/cpp-ch/local-engine/Disks/ObjectStorages/GlutenHDFSObjectStorage.cpp @@ -38,7 +38,7 @@ DB::ObjectStorageKey local_engine::GlutenHDFSObjectStorage::generateObjectKeyFor initializeHDFSFS(); /// what ever data_source_description.description value is, consider that key as relative key chassert(data_directory.starts_with("/")); - return ObjectStorageKey::createAsRelative(fs::path(url_without_path) / data_directory.substr(1) / path); + return ObjectStorageKey::createAsRelative(fs::path(url_without_path) / data_directory.substr(1), path); } } #endif diff --git a/cpp-ch/local-engine/Disks/ObjectStorages/GlutenHDFSObjectStorage.h b/cpp-ch/local-engine/Disks/ObjectStorages/GlutenHDFSObjectStorage.h index a532c98cb87d5..da37e1d782dbc 100644 --- a/cpp-ch/local-engine/Disks/ObjectStorages/GlutenHDFSObjectStorage.h +++ b/cpp-ch/local-engine/Disks/ObjectStorages/GlutenHDFSObjectStorage.h @@ -33,7 +33,7 @@ class GlutenHDFSObjectStorage final : public DB::HDFSObjectStorage const String & hdfs_root_path_, SettingsPtr settings_, const Poco::Util::AbstractConfiguration & config_) - : HDFSObjectStorage(hdfs_root_path_, std::move(settings_), config_, /* lazy_initialize */true), config(config_) + : HDFSObjectStorage(hdfs_root_path_, std::move(settings_), config_, /* lazy_initialize */false) { } std::unique_ptr readObject( /// NOLINT @@ -43,8 +43,6 @@ class GlutenHDFSObjectStorage final : public DB::HDFSObjectStorage std::optional 
file_size = {}) const override; DB::ObjectStorageKey generateObjectKeyForPath(const std::string & path) const override; hdfsFS getHDFSFS() const { return hdfs_fs.get(); } -private: - const Poco::Util::AbstractConfiguration & config; }; #endif diff --git a/cpp-ch/local-engine/Disks/ObjectStorages/registerGlutenDiskObjectStorage.cpp b/cpp-ch/local-engine/Disks/ObjectStorages/registerGlutenDiskObjectStorage.cpp index 800b51f93e948..9e45464980347 100644 --- a/cpp-ch/local-engine/Disks/ObjectStorages/registerGlutenDiskObjectStorage.cpp +++ b/cpp-ch/local-engine/Disks/ObjectStorages/registerGlutenDiskObjectStorage.cpp @@ -59,6 +59,14 @@ static S3::URI getS3URI( return uri; } +static std::string getEndpoint( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const ContextPtr & context) +{ + return context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); +} + void registerGlutenS3ObjectStorage(ObjectStorageFactory & factory) { static constexpr auto disk_type = "s3_gluten"; @@ -74,8 +82,9 @@ void registerGlutenS3ObjectStorage(ObjectStorageFactory & factory) { auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); - auto settings = getSettings(config, config_prefix, context); - auto client = getClient(config, config_prefix, context, *settings, true); + auto endpoint = getEndpoint(config, config_prefix, context); + auto settings = getSettings(config, config_prefix, context, endpoint, /* validate_settings */true); + auto client = getClient(endpoint, *settings, context, /* for_disk_s3 */true); auto key_generator = createObjectStorageKeysGeneratorAsIsWithPrefix(uri.key); auto object_storage = std::make_shared( @@ -112,7 +121,7 @@ void registerGlutenHDFSObjectStorage(ObjectStorageFactory & factory) config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), context->getSettingsRef().hdfs_replication ); - return std::make_unique(uri, std::move(settings), config); + return std::make_shared(uri, std::move(settings), config); }); } #endif diff --git a/cpp-ch/local-engine/Disks/registerGlutenDisks.cpp b/cpp-ch/local-engine/Disks/registerGlutenDisks.cpp index c7e9c5fd32baf..8a920edcce77a 100644 --- a/cpp-ch/local-engine/Disks/registerGlutenDisks.cpp +++ b/cpp-ch/local-engine/Disks/registerGlutenDisks.cpp @@ -40,6 +40,8 @@ void registerGlutenHDFSObjectStorage(DB::ObjectStorageFactory & factory); void registerGlutenDisks(bool global_skip_access_check) { auto & factory = DB::DiskFactory::instance(); + +#if USE_AWS_S3 auto creator = [global_skip_access_check]( const String & name, const Poco::Util::AbstractConfiguration & config, @@ -66,7 +68,7 @@ void registerGlutenDisks(bool global_skip_access_check) }; auto & object_factory = DB::ObjectStorageFactory::instance(); -#if USE_AWS_S3 + registerGlutenS3ObjectStorage(object_factory); factory.registerDiskType("s3_gluten", creator); /// For compatibility #endif @@ -82,11 +84,20 @@ void registerGlutenDisks(bool global_skip_access_check) bool) -> DB::DiskPtr { bool skip_access_check = global_skip_access_check || config.getBool(config_prefix + ".skip_access_check", false); - auto object_storage = DB::ObjectStorageFactory::instance().create(name, config, config_prefix, context, skip_access_check); + auto object_storage_creator = [name, skip_access_check, config_prefix]( + const Poco::Util::AbstractConfiguration & conf, DB::ContextPtr ctx) -> DB::ObjectStoragePtr + { return DB::ObjectStorageFactory::instance().create(name, conf, 
config_prefix, ctx, skip_access_check); }; + auto object_storage = object_storage_creator(config, context); auto metadata_storage = DB::MetadataStorageFactory::instance().create(name, config, config_prefix, object_storage, "local"); DB::DiskObjectStoragePtr disk = std::make_shared( - name, object_storage->getCommonKeyPrefix(), std::move(metadata_storage), std::move(object_storage), config, config_prefix); + name, + object_storage->getCommonKeyPrefix(), + std::move(metadata_storage), + std::move(object_storage), + config, + config_prefix, + object_storage_creator); disk->startup(context, skip_access_check); return disk; diff --git a/cpp-ch/local-engine/Functions/CMakeLists.txt b/cpp-ch/local-engine/Functions/CMakeLists.txt index 5968c86094f72..9e31595e03d5c 100644 --- a/cpp-ch/local-engine/Functions/CMakeLists.txt +++ b/cpp-ch/local-engine/Functions/CMakeLists.txt @@ -16,51 +16,56 @@ add_headers_and_sources(gluten_spark_functions .) add_library(gluten_spark_functions_obj OBJECT ${gluten_spark_functions_sources}) -list (APPEND PRIVATE_LIBS - boost::headers_only - pcg_random - Poco::Foundation - Poco::Util - Poco::Net - Poco::JSON - ch_contrib::cctz - ch_contrib::fmt - ch_contrib::pdqsort - ch_contrib::miniselect - ch_contrib::magic_enum - ch_contrib::double_conversion - ch_contrib::dragonbox_to_chars - ch_contrib::re2 - ch_contrib::abseil_swiss_tables - ch_contrib::sparsehash - ch_contrib::metrohash - ch_contrib::murmurhash - ch_contrib::wyhash - ch_contrib::cityhash - ch_contrib::farmhash - ch_contrib::xxHash - OpenSSL::SSL -) +list( + APPEND + PRIVATE_LIBS + boost::headers_only + pcg_random + Poco::Foundation + Poco::Util + Poco::Net + Poco::JSON + ch_contrib::cctz + ch_contrib::fmt + ch_contrib::pdqsort + ch_contrib::miniselect + ch_contrib::magic_enum + ch_contrib::double_conversion + ch_contrib::dragonbox_to_chars + ch_contrib::re2 + ch_contrib::abseil_swiss_tables + ch_contrib::sparsehash + ch_contrib::metrohash + ch_contrib::murmurhash + ch_contrib::wyhash + ch_contrib::cityhash + ch_contrib::farmhash + ch_contrib::xxHash + OpenSSL::SSL) -if (TARGET ch_contrib::vectorscan) - list (APPEND PRIVATE_LIBS ch_contrib::vectorscan) +if(TARGET ch_contrib::vectorscan) + list(APPEND PRIVATE_LIBS ch_contrib::vectorscan) endif() -if (TARGET ch_contrib::rapidjson) - list (APPEND PRIVATE_LIBS ch_contrib::rapidjson) +if(TARGET ch_contrib::rapidjson) + list(APPEND PRIVATE_LIBS ch_contrib::rapidjson) endif() -if (TARGET ch_contrib::simdjson) - list (APPEND PRIVATE_LIBS ch_contrib::simdjson) +if(TARGET ch_contrib::simdjson) + list(APPEND PRIVATE_LIBS ch_contrib::simdjson) endif() -if (TARGET ch_rust::blake3) - list (APPEND PRIVATE_LIBS ch_rust::blake3) +if(TARGET ch_rust::blake3) + list(APPEND PRIVATE_LIBS ch_rust::blake3) endif() -list (APPEND OBJECT_LIBS $) +if(TARGET ch_contrib::gwp_asan) + list(APPEND PRIVATE_LIBS ch_contrib::gwp_asan) +endif() + +list(APPEND OBJECT_LIBS $) target_link_libraries(gluten_spark_functions_obj PRIVATE ${PRIVATE_LIBS}) add_library(gluten_spark_functions INTERFACE) -target_link_libraries(gluten_spark_functions INTERFACE ${OBJECT_LIBS}) \ No newline at end of file +target_link_libraries(gluten_spark_functions INTERFACE ${OBJECT_LIBS}) diff --git a/cpp-ch/local-engine/Functions/FunctionGetDateData.h b/cpp-ch/local-engine/Functions/FunctionGetDateData.h index 4f79d4bd0c4b8..6cd93dd02c985 100644 --- a/cpp-ch/local-engine/Functions/FunctionGetDateData.h +++ b/cpp-ch/local-engine/Functions/FunctionGetDateData.h @@ -46,7 +46,7 @@ class FunctionGetDateData : public DB::IFunction 
FunctionGetDateData() = default; ~FunctionGetDateData() override = default; - DB::ColumnPtr executeImpl(const DB::ColumnsWithTypeAndName & arguments, const DB::DataTypePtr & result_type, size_t) const override + DB::ColumnPtr executeImpl(const DB::ColumnsWithTypeAndName & arguments, const DB::DataTypePtr &, size_t) const override { if (arguments.size() != 1) throw DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {}'s arguments number must be 1.", getName()); @@ -54,9 +54,6 @@ class FunctionGetDateData : public DB::IFunction const DB::ColumnWithTypeAndName arg1 = arguments[0]; const auto * src_col = checkAndGetColumn(arg1.column.get()); size_t size = src_col->size(); - - if (!result_type->isNullable()) - throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {}'s return type must be nullable", getName()); using ColVecTo = ColumnVector; typename ColVecTo::MutablePtr result_column = ColVecTo::create(size, 0); diff --git a/cpp-ch/local-engine/Functions/FunctionGreatestLeast.h b/cpp-ch/local-engine/Functions/FunctionGreatestLeast.h new file mode 100644 index 0000000000000..6930c1d75b796 --- /dev/null +++ b/cpp-ch/local-engine/Functions/FunctionGreatestLeast.h @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} +} +namespace local_engine +{ +template +class FunctionGreatestestLeast : public DB::FunctionLeastGreatestGeneric +{ +public: + bool useDefaultImplementationForNulls() const override { return false; } + virtual String getName() const = 0; + +private: + DB::DataTypePtr getReturnTypeImpl(const DB::DataTypes & types) const override + { + if (types.empty()) + throw DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} cannot be called without arguments", getName()); + return makeNullable(getLeastSupertype(types)); + } + + DB::ColumnPtr executeImpl(const DB::ColumnsWithTypeAndName & arguments, const DB::DataTypePtr & result_type, size_t input_rows_count) const override + { + size_t num_arguments = arguments.size(); + DB::Columns converted_columns(num_arguments); + for (size_t arg = 0; arg < num_arguments; ++arg) + converted_columns[arg] = castColumn(arguments[arg], result_type)->convertToFullColumnIfConst(); + auto result_column = result_type->createColumn(); + result_column->reserve(input_rows_count); + for (size_t row_num = 0; row_num < input_rows_count; ++row_num) + { + size_t best_arg = 0; + for (size_t arg = 1; arg < num_arguments; ++arg) + { + if constexpr (kind == DB::LeastGreatest::Greatest) + { + auto cmp_result = converted_columns[arg]->compareAt(row_num, row_num, *converted_columns[best_arg], -1); + if (cmp_result > 0) + best_arg = arg; + } + else + { + auto cmp_result = converted_columns[arg]->compareAt(row_num, row_num, *converted_columns[best_arg], 1); + if (cmp_result < 0) + best_arg = arg; + } + } + result_column->insertFrom(*converted_columns[best_arg], row_num); + } + return result_column; + } +}; + +} diff --git a/cpp-ch/local-engine/Functions/SparkArrayFlatten.cpp b/cpp-ch/local-engine/Functions/SparkArrayFlatten.cpp new file mode 100644 index 0000000000000..d39bca5ea104e --- /dev/null +++ b/cpp-ch/local-engine/Functions/SparkArrayFlatten.cpp @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int ILLEGAL_COLUMN; +} + +/// arrayFlatten([[1, 2, 3], [4, 5]]) = [1, 2, 3, 4, 5] - flatten array. 
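+/// Note: only one level of nesting is removed, e.g. sparkArrayFlatten([[[1, 2]], [[3]]]) = [[1, 2], [3]],
+/// and a NULL element in the outer array makes the result NULL, as Spark's flatten does
+/// (see the null handling in executeImpl below).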
+class SparkArrayFlatten : public IFunction +{ +public: + static constexpr auto name = "sparkArrayFlatten"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isArray(arguments[0])) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}, expected Array", + arguments[0]->getName(), getName()); + + DataTypePtr nested_type = arguments[0]; + nested_type = checkAndGetDataType(removeNullable(nested_type).get())->getNestedType(); + return nested_type; + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + /** We create an array column with array elements as the most deep elements of nested arrays, + * and construct offsets by selecting elements of most deep offsets by values of ancestor offsets. + * +Example 1: + +Source column: Array(Array(UInt8)): +Row 1: [[1, 2, 3], [4, 5]], Row 2: [[6], [7, 8]] +data: [1, 2, 3], [4, 5], [6], [7, 8] +offsets: 2, 4 +data.data: 1 2 3 4 5 6 7 8 +data.offsets: 3 5 6 8 + +Result column: Array(UInt8): +Row 1: [1, 2, 3, 4, 5], Row 2: [6, 7, 8] +data: 1 2 3 4 5 6 7 8 +offsets: 5 8 + +Result offsets are selected from the most deep (data.offsets) by previous deep (offsets) (and values are decremented by one): +3 5 6 8 + ^ ^ + +Example 2: + +Source column: Array(Array(Array(UInt8))): +Row 1: [[], [[1], [], [2, 3]]], Row 2: [[[4]]] + +most deep data: 1 2 3 4 + +offsets1: 2 3 +offsets2: 0 3 4 +- ^ ^ - select by prev offsets +offsets3: 1 1 3 4 +- ^ ^ - select by prev offsets + +result offsets: 3, 4 +result: Row 1: [1, 2, 3], Row2: [4] + */ + + const ColumnArray * src_col = checkAndGetColumn(arguments[0].column.get()); + + if (!src_col) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} in argument of function 'arrayFlatten'", + arguments[0].column->getName()); + + const IColumn::Offsets & src_offsets = src_col->getOffsets(); + + ColumnArray::ColumnOffsets::MutablePtr result_offsets_column; + const IColumn::Offsets * prev_offsets = &src_offsets; + const IColumn * prev_data = &src_col->getData(); + bool nullable = prev_data->isNullable(); + // when array has null element, return null + if (nullable) + { + const ColumnNullable * nullable_column = checkAndGetColumn(prev_data); + prev_data = nullable_column->getNestedColumnPtr().get(); + for (size_t i = 0; i < nullable_column->size(); i++) + { + if (nullable_column->isNullAt(i)) + { + auto res= nullable_column->cloneEmpty(); + res->insertManyDefaults(input_rows_count); + return res; + } + } + } + if (isNothing(prev_data->getDataType())) + return prev_data->cloneResized(input_rows_count); + // only flatten one dimension + if (const ColumnArray * next_col = checkAndGetColumn(prev_data)) + { + result_offsets_column = ColumnArray::ColumnOffsets::create(input_rows_count); + + IColumn::Offsets & result_offsets = result_offsets_column->getData(); + + const IColumn::Offsets * next_offsets = &next_col->getOffsets(); + + for (size_t i = 0; i < input_rows_count; ++i) + result_offsets[i] = (*next_offsets)[(*prev_offsets)[i] - 1]; /// -1 array subscript is Ok, see PaddedPODArray + prev_data = &next_col->getData(); + } + + auto 
res = ColumnArray::create( + prev_data->getPtr(), + result_offsets_column ? std::move(result_offsets_column) : src_col->getOffsetsPtr()); + if (nullable) + return makeNullable(res); + return res; + } + +private: + String getName() const override + { + return name; + } +}; + +REGISTER_FUNCTION(SparkArrayFlatten) +{ + factory.registerFunction(); +} + +} diff --git a/cpp-ch/local-engine/Functions/SparkFunctionArrayDistinct.cpp b/cpp-ch/local-engine/Functions/SparkFunctionArrayDistinct.cpp index 89598ff7a1b10..0779346b6e043 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionArrayDistinct.cpp +++ b/cpp-ch/local-engine/Functions/SparkFunctionArrayDistinct.cpp @@ -314,7 +314,10 @@ void FunctionArrayDistinctSpark::executeHashed( if (!set.find(hash)) { set.insert(hash); - res_data_col.insertFrom(src_data, j); + if (nullable_col) + res_data_col.insertFrom(*nullable_col, j); + else + res_data_col.insertFrom(src_data, j); } } diff --git a/cpp-ch/local-engine/Functions/SparkFunctionFloor.h b/cpp-ch/local-engine/Functions/SparkFunctionFloor.h index b016c9afa4079..ce33d11dbd8cf 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionFloor.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionFloor.h @@ -14,13 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include +#pragma once + #include #include -#include #include -#include +#include +#include +#include using namespace DB; @@ -130,20 +131,29 @@ struct SparkFloatFloorImpl { private: static_assert(!is_decimal); - using Op = FloatRoundingComputation; - using Data = std::array; + template < + Vectorize vectorize = +#ifdef __SSE4_1__ + Vectorize::Yes +#else + Vectorize::No +#endif + > + using Op = FloatRoundingComputation; + using Data = std::array::data_count>; + public: static void apply(const PaddedPODArray & in, size_t scale, PaddedPODArray & out, PaddedPODArray & null_map) { - auto mm_scale = Op::prepare(scale); + auto mm_scale = Op<>::prepare(scale); const size_t data_count = std::tuple_size(); - const T* end_in = in.data() + in.size(); - const T* limit = in.data() + in.size() / data_count * data_count; - const T* __restrict p_in = in.data(); - T* __restrict p_out = out.data(); + const T * end_in = in.data() + in.size(); + const T * limit = in.data() + in.size() / data_count * data_count; + const T * __restrict p_in = in.data(); + T * __restrict p_out = out.data(); while (p_in < limit) { - Op::compute(p_in, mm_scale, p_out); + Op<>::compute(p_in, mm_scale, p_out); p_in += data_count; p_out += data_count; } @@ -154,7 +164,7 @@ struct SparkFloatFloorImpl Data tmp_dst; size_t tail_size_bytes = (end_in - p_in) * sizeof(*p_in); memcpy(&tmp_src, p_in, tail_size_bytes); - Op::compute(reinterpret_cast(&tmp_src), mm_scale, reinterpret_cast(&tmp_dst)); + Op<>::compute(reinterpret_cast(&tmp_src), mm_scale, reinterpret_cast(&tmp_dst)); memcpy(p_out, &tmp_dst, tail_size_bytes); } @@ -171,11 +181,31 @@ struct SparkFloatFloorImpl checkAndSetNullable(out[i], null_map[i]); } } - }; class SparkFunctionFloor : public DB::FunctionFloor { + static Scale getScaleArg(const ColumnsWithTypeAndName & arguments) + { + if (arguments.size() == 2) + { + const IColumn & scale_column = *arguments[1].column; + if (!isColumnConst(scale_column)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Scale argument for rounding functions must be constant"); + + Field scale_field = assert_cast(scale_column).getField(); + if (scale_field.getType() != Field::Types::UInt64 && scale_field.getType() != 
Field::Types::Int64) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Scale argument for rounding functions must have integer type"); + + Int64 scale64 = scale_field.get(); + if (scale64 > std::numeric_limits::max() || scale64 < std::numeric_limits::min()) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Scale argument for rounding function is too large"); + + return scale64; + } + return 0; + } + public: static constexpr auto name = "sparkFloor"; static DB::FunctionPtr create(DB::ContextPtr) { return std::make_shared(); } @@ -183,17 +213,20 @@ class SparkFunctionFloor : public DB::FunctionFloor ~SparkFunctionFloor() override = default; String getName() const override { return name; } - DB::DataTypePtr getReturnTypeImpl(const DB::DataTypes & arguments) const override + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + + DB::DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { auto result_type = DB::FunctionFloor::getReturnTypeImpl(arguments); return makeNullable(result_type); } - DB::ColumnPtr executeImpl(const DB::ColumnsWithTypeAndName & arguments, const DB::DataTypePtr & result_type, size_t input_rows) const override + DB::ColumnPtr + executeImpl(const DB::ColumnsWithTypeAndName & arguments, const DB::DataTypePtr & result_type, size_t input_rows) const override { const ColumnWithTypeAndName & first_arg = arguments[0]; Scale scale_arg = getScaleArg(arguments); - switch(first_arg.type->getTypeId()) + switch (first_arg.type->getTypeId()) { case TypeIndex::Float32: return executeInternal(first_arg.column, scale_arg); @@ -206,7 +239,7 @@ class SparkFunctionFloor : public DB::FunctionFloor } } - template + template static ColumnPtr executeInternal(const ColumnPtr & col_arg, const Scale & scale_arg) { const auto * col = checkAndGetColumn>(col_arg.get()); diff --git a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h index 6ba05e901791e..22b395536c235 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h @@ -113,6 +113,8 @@ class GetJsonObjectImpl JSONStringSerializer serializer(*col_str); if (elements.size() == 1) [[likely]] { + if (elements[0].isNull()) + return false; nullable_col_str.getNullMapData().push_back(0); if (elements[0].isString()) { @@ -230,7 +232,7 @@ class FlattenJSONStringOnRequiredFunction : public DB::IFunction tmp.push('{'); else if (ch == '}') { - if (tmp.top() == '{') + if (!tmp.empty() && tmp.top() == '{') tmp.pop(); } if (tmp.empty()) diff --git a/cpp-ch/local-engine/Functions/SparkFunctionGreatest.cpp b/cpp-ch/local-engine/Functions/SparkFunctionGreatest.cpp new file mode 100644 index 0000000000000..920fe1b9c9cce --- /dev/null +++ b/cpp-ch/local-engine/Functions/SparkFunctionGreatest.cpp @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +namespace local_engine +{ +class SparkFunctionGreatest : public FunctionGreatestestLeast +{ +public: + static constexpr auto name = "sparkGreatest"; + static DB::FunctionPtr create(DB::ContextPtr) { return std::make_shared(); } + SparkFunctionGreatest() = default; + ~SparkFunctionGreatest() override = default; + String getName() const override + { + return name; + } +}; + +REGISTER_FUNCTION(SparkGreatest) +{ + factory.registerFunction(); +} +} diff --git a/cpp-ch/local-engine/Functions/SparkFunctionHashingExtended.h b/cpp-ch/local-engine/Functions/SparkFunctionHashingExtended.h index 82d678aa2a4e0..57bf00ba99044 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionHashingExtended.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionHashingExtended.h @@ -28,6 +28,7 @@ #include #include #include +#include namespace DB { @@ -200,14 +201,14 @@ class SparkFunctionAnyHash : public IFunction { if constexpr (std::is_same_v) { - if (n == -0.0f) [[unlikely]] + if (n == -0.0f || isNaN(n)) [[unlikely]] return applyNumber(0, seed); else return Impl::apply(reinterpret_cast(&n), sizeof(n), seed); } else { - if (n == -0.0) [[unlikely]] + if (n == -0.0 || isNaN(n)) [[unlikely]] return applyNumber(0, seed); else return Impl::apply(reinterpret_cast(&n), sizeof(n), seed); diff --git a/cpp-ch/local-engine/Functions/SparkFunctionLeast.cpp b/cpp-ch/local-engine/Functions/SparkFunctionLeast.cpp new file mode 100644 index 0000000000000..70aafdf072095 --- /dev/null +++ b/cpp-ch/local-engine/Functions/SparkFunctionLeast.cpp @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include + +namespace local_engine +{ +class SparkFunctionLeast : public FunctionGreatestestLeast +{ +public: + static constexpr auto name = "sparkLeast"; + static DB::FunctionPtr create(DB::ContextPtr) { return std::make_shared(); } + SparkFunctionLeast() = default; + ~SparkFunctionLeast() override = default; + String getName() const override + { + return name; + } +}; + +REGISTER_FUNCTION(SparkLeast) +{ + factory.registerFunction(); +} +} diff --git a/cpp-ch/local-engine/Functions/SparkFunctionRint.cpp b/cpp-ch/local-engine/Functions/SparkFunctionRint.cpp new file mode 100644 index 0000000000000..656994c3ea62e --- /dev/null +++ b/cpp-ch/local-engine/Functions/SparkFunctionRint.cpp @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "SparkFunctionRint.h" + +#include + +namespace DB +{ +namespace ErrorCodes +{ +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} +} + +namespace local_engine +{ + +DB::ColumnPtr SparkFunctionRint::executeImpl( + const DB::ColumnsWithTypeAndName & arguments, + const DB::DataTypePtr & result_type, + size_t /*input_rows_count*/) const +{ + if (arguments.size() != 1) + throw DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly one argument.", getName()); + if (!isFloat(*arguments[0].type)) + throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be float32 or float64, got {}", getName(), arguments[0].type->getName()); + + auto output = result_type->createColumn(); + bool is_float32 = DB::WhichDataType(*arguments[0].type).isFloat32(); + auto input = arguments[0].column->convertToFullIfNeeded(); + auto& output_data = static_cast(output.get())->getData(); + output_data.resize(input->size()); + for (size_t i = 0; i < input->size(); ++i) + { + if (is_float32) + output_data[i] = std::rint(DB::checkAndGetColumn(*input).getData()[i]); + else + output_data[i] = std::rint(DB::checkAndGetColumn(*input).getData()[i]); + } + return std::move(output); +} + + +REGISTER_FUNCTION(SparkFunctionRint) +{ + factory.registerFunction(); +} +} \ No newline at end of file diff --git a/cpp-ch/local-engine/Functions/SparkFunctionRint.h b/cpp-ch/local-engine/Functions/SparkFunctionRint.h new file mode 100644 index 0000000000000..ee7a7bb1ea66e --- /dev/null +++ b/cpp-ch/local-engine/Functions/SparkFunctionRint.h @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace local_engine +{ +class SparkFunctionRint : public DB::IFunction +{ +public: + static constexpr auto name = "sparkRint"; + + static DB::FunctionPtr create(DB::ContextPtr) { return std::make_shared(); } + + SparkFunctionRint() = default; + + ~SparkFunctionRint() override = default; + + String getName() const override + { + return name; + } + + size_t getNumberOfArguments() const override + { + return 1; + } + + bool isSuitableForShortCircuitArgumentsExecution(const DB::DataTypesWithConstInfo &) const override { return true; } + + DB::ColumnPtr executeImpl( + const DB::ColumnsWithTypeAndName & arguments, + const DB::DataTypePtr & result_type, + size_t input_rows_count) const override; + + DB::DataTypePtr getReturnTypeImpl(const DB::DataTypes &) const override + { + return std::make_shared(); + } +}; +} diff --git a/cpp-ch/local-engine/Functions/SparkFunctionRoundHalfUp.h b/cpp-ch/local-engine/Functions/SparkFunctionRoundHalfUp.h index 47135aabd94f7..441842d4e7e1a 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionRoundHalfUp.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionRoundHalfUp.h @@ -18,6 +18,11 @@ #include +namespace DB::ErrorCodes +{ +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + namespace local_engine { using namespace DB; @@ -35,10 +40,11 @@ class BaseFloatRoundingHalfUpComputation static VectorType load(const ScalarType * in) { return _mm_loadu_ps(in); } static VectorType load1(const ScalarType in) { return _mm_load1_ps(&in); } - static void store(ScalarType * out, VectorType val) { _mm_storeu_ps(out, val);} + static void store(ScalarType * out, VectorType val) { _mm_storeu_ps(out, val); } static VectorType multiply(VectorType val, VectorType scale) { return _mm_mul_ps(val, scale); } static VectorType divide(VectorType val, VectorType scale) { return _mm_div_ps(val, scale); } - template static VectorType apply(VectorType val) + template + static VectorType apply(VectorType val) { ScalarType tempFloatsIn[data_count]; ScalarType tempFloatsOut[data_count]; @@ -49,10 +55,7 @@ class BaseFloatRoundingHalfUpComputation return load(tempFloatsOut); } - static VectorType prepare(size_t scale) - { - return load1(scale); - } + static VectorType prepare(size_t scale) { return load1(scale); } }; template <> @@ -65,10 +68,11 @@ class BaseFloatRoundingHalfUpComputation static VectorType load(const ScalarType * in) { return _mm_loadu_pd(in); } static VectorType load1(const ScalarType in) { return _mm_load1_pd(&in); } - static void store(ScalarType * out, VectorType val) { _mm_storeu_pd(out, val);} + static void store(ScalarType * out, VectorType val) { _mm_storeu_pd(out, val); } static VectorType multiply(VectorType val, VectorType scale) { return _mm_mul_pd(val, scale); } static VectorType divide(VectorType val, VectorType scale) { return _mm_div_pd(val, scale); } - template static VectorType apply(VectorType val) + template + static VectorType 
apply(VectorType val) { ScalarType tempFloatsIn[data_count]; ScalarType tempFloatsOut[data_count]; @@ -79,10 +83,7 @@ class BaseFloatRoundingHalfUpComputation return load(tempFloatsOut); } - static VectorType prepare(size_t scale) - { - return load1(scale); - } + static VectorType prepare(size_t scale) { return load1(scale); } }; @@ -135,11 +136,11 @@ struct FloatRoundingHalfUpImpl const size_t data_count = std::tuple_size(); - const T* end_in = in.data() + in.size(); - const T* limit = in.data() + in.size() / data_count * data_count; + const T * end_in = in.data() + in.size(); + const T * limit = in.data() + in.size() / data_count * data_count; - const T* __restrict p_in = in.data(); - T* __restrict p_out = out.data(); + const T * __restrict p_in = in.data(); + T * __restrict p_out = out.data(); while (p_in < limit) { @@ -169,9 +170,10 @@ template - using FunctionRoundingImpl = std::conditional_t, - FloatRoundingHalfUpImpl, - IntegerRoundingImpl>; + using FunctionRoundingImpl = std::conditional_t< + std::is_floating_point_v, + FloatRoundingHalfUpImpl, + IntegerRoundingImpl>; static ColumnPtr apply(const IColumn * col_general, Scale scale_arg) { @@ -233,10 +235,7 @@ class FunctionRoundingHalfUp : public IFunction static constexpr auto name = "roundHalfUp"; static FunctionPtr create(ContextPtr) { return std::make_shared(); } - String getName() const override - { - return name; - } + String getName() const override { return name; } bool isVariadic() const override { return true; } size_t getNumberOfArguments() const override { return 0; } @@ -246,14 +245,16 @@ class FunctionRoundingHalfUp : public IFunction DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if ((arguments.empty()) || (arguments.size() > 2)) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, should be 1 or 2.", - getName(), arguments.size()); + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 1 or 2.", + getName(), + arguments.size()); for (const auto & type : arguments) if (!isNumber(type)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", - arguments[0]->getName(), getName()); + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName()); return arguments[0]; } @@ -267,13 +268,11 @@ class FunctionRoundingHalfUp : public IFunction throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Scale argument for rounding functions must be constant"); Field scale_field = assert_cast(scale_column).getField(); - if (scale_field.getType() != Field::Types::UInt64 - && scale_field.getType() != Field::Types::Int64) + if (scale_field.getType() != Field::Types::UInt64 && scale_field.getType() != Field::Types::Int64) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Scale argument for rounding functions must have integer type"); Int64 scale64 = scale_field.get(); - if (scale64 > std::numeric_limits::max() - || scale64 < std::numeric_limits::min()) + if (scale64 > std::numeric_limits::max() || scale64 < std::numeric_limits::min()) throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Scale argument for rounding function is too large"); return scale64; @@ -305,26 +304,24 @@ class FunctionRoundingHalfUp : public IFunction }; if (!callOnIndexAndDataType(column.type->getTypeId(), call)) - { throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal 
column {} of argument of function {}", column.name, getName()); - } return res; } - bool hasInformationAboutMonotonicity() const override - { - return true; - } + bool hasInformationAboutMonotonicity() const override { return true; } Monotonicity getMonotonicityForRange(const IDataType &, const Field &, const Field &) const override { - return { .is_monotonic = true, .is_always_monotonic = true }; + return {.is_monotonic = true, .is_always_monotonic = true}; } }; -struct NameRoundHalfUp { static constexpr auto name = "roundHalfUp"; }; +struct NameRoundHalfUp +{ + static constexpr auto name = "roundHalfUp"; +}; using FunctionRoundHalfUp = FunctionRoundingHalfUp; diff --git a/cpp-ch/local-engine/Functions/SparkFunctionToDateTime.h b/cpp-ch/local-engine/Functions/SparkFunctionToDateTime.h index d846f89564124..980af85bd9838 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionToDateTime.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionToDateTime.h @@ -145,9 +145,6 @@ class SparkFunctionConvertToDateTime : public IFunction if (arguments.size() != 1 && arguments.size() != 2) throw DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {}'s arguments number must be 1 or 2.", name); - if (!result_type->isNullable()) - throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {}'s return type must be nullable", name); - if (!isDateTime64(removeNullable(result_type))) throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {}'s return type must be datetime.", name); diff --git a/cpp-ch/local-engine/Functions/SparkFunctionTrim.cpp b/cpp-ch/local-engine/Functions/SparkFunctionTrim.cpp index d8f6be1bfc32f..88ed3f635672d 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionTrim.cpp +++ b/cpp-ch/local-engine/Functions/SparkFunctionTrim.cpp @@ -77,8 +77,6 @@ namespace bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (arguments.size() != 2) @@ -112,19 +110,34 @@ namespace if (!src_str_col) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument of function {} must be String", getName()); - const ColumnConst * trim_str_col = checkAndGetColumnConst(arguments[1].column.get()); - if (!trim_str_col) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument of function {} must be Const String", getName()); - - String trim_str = trim_str_col->getValue(); - if (trim_str.empty()) - return src_str_col->cloneResized(input_rows_count); - auto res_col = ColumnString::create(); - res_col->reserve(input_rows_count); + if (const auto * trim_const_str_col = checkAndGetColumnConst(arguments[1].column.get())) + { + String trim_str = trim_const_str_col->getValue(); + if (trim_str.empty()) + return src_str_col->cloneResized(input_rows_count); + + auto res_col = ColumnString::create(); + res_col->reserve(input_rows_count); + executeVector(src_str_col->getChars(), src_str_col->getOffsets(), res_col->getChars(), res_col->getOffsets(), trim_str); + return std::move(res_col); + } + else if (const auto * trim_str_col = checkAndGetColumn(arguments[1].column.get())) + { + auto res_col = ColumnString::create(); + res_col->reserve(input_rows_count); + + executeVector( + src_str_col->getChars(), + src_str_col->getOffsets(), + res_col->getChars(), + res_col->getOffsets(), + trim_str_col->getChars(), + trim_str_col->getOffsets()); + 
return std::move(res_col); + } - executeVector(src_str_col->getChars(), src_str_col->getOffsets(), res_col->getChars(), res_col->getOffsets(), trim_str); - return std::move(res_col); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument of function {} must be String or Const String", getName()); } private: @@ -159,6 +172,43 @@ namespace } } + void executeVector( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets, + const ColumnString::Chars & trim_data, + const ColumnString::Offsets & trim_offsets) const + { + res_data.reserve_exact(data.size()); + + size_t rows = offsets.size(); + res_offsets.resize_exact(rows); + + size_t prev_offset = 0; + size_t prev_trim_str_offset = 0; + size_t res_offset = 0; + + const UInt8 * start; + size_t length; + + for (size_t i = 0; i < rows; ++i) + { + std::unordered_set trim_set( + &trim_data[prev_trim_str_offset], &trim_data[prev_trim_str_offset] + trim_offsets[i] - prev_trim_str_offset - 1); + + trim(reinterpret_cast(&data[prev_offset]), offsets[i] - prev_offset - 1, start, length, trim_set); + res_data.resize_exact(res_data.size() + length + 1); + memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], start, length); + res_offset += length + 1; + res_data[res_offset - 1] = '\0'; + + res_offsets[i] = res_offset; + prev_offset = offsets[i]; + prev_trim_str_offset = trim_offsets[i]; + } + } + void trim(const UInt8 * data, size_t size, const UInt8 *& res_data, size_t & res_size, const std::unordered_set & trim_set) const { diff --git a/cpp-ch/local-engine/Join/BroadCastJoinBuilder.cpp b/cpp-ch/local-engine/Join/BroadCastJoinBuilder.cpp index d90951241166c..1c79a00a7c4c5 100644 --- a/cpp-ch/local-engine/Join/BroadCastJoinBuilder.cpp +++ b/cpp-ch/local-engine/Join/BroadCastJoinBuilder.cpp @@ -82,6 +82,7 @@ std::shared_ptr buildJoin( jlong row_count, const std::string & join_keys, substrait::JoinRel_JoinType join_type, + bool has_mixed_join_condition, const std::string & named_struct) { auto join_key_list = Poco::StringTokenizer(join_keys, ","); @@ -103,7 +104,9 @@ std::shared_ptr buildJoin( row_count, key_names, true, - std::make_shared(SizeLimits(), true, kind, strictness, key_names), + kind, + strictness, + has_mixed_join_condition, columns_description, ConstraintsDescription(), key, diff --git a/cpp-ch/local-engine/Join/BroadCastJoinBuilder.h b/cpp-ch/local-engine/Join/BroadCastJoinBuilder.h index 5aa1e0876ed09..9a6837e35a0ac 100644 --- a/cpp-ch/local-engine/Join/BroadCastJoinBuilder.h +++ b/cpp-ch/local-engine/Join/BroadCastJoinBuilder.h @@ -36,6 +36,7 @@ std::shared_ptr buildJoin( jlong row_count, const std::string & join_keys, substrait::JoinRel_JoinType join_type, + bool has_mixed_join_condition, const std::string & named_struct); void cleanBuildHashTable(const std::string & hash_table_id, jlong instance); std::shared_ptr getJoin(const std::string & hash_table_id); diff --git a/cpp-ch/local-engine/Join/StorageJoinFromReadBuffer.cpp b/cpp-ch/local-engine/Join/StorageJoinFromReadBuffer.cpp index 6d0021adbf400..326e11a84f815 100644 --- a/cpp-ch/local-engine/Join/StorageJoinFromReadBuffer.cpp +++ b/cpp-ch/local-engine/Join/StorageJoinFromReadBuffer.cpp @@ -16,13 +16,16 @@ */ #include "StorageJoinFromReadBuffer.h" +#include #include -#include +#include #include #include #include #include +#include + namespace DB { class HashJoin; @@ -40,25 +43,23 @@ extern const int DEADLOCK_AVOIDED; using namespace DB; -void restore(DB::ReadBuffer & in, IJoin & 
join, const Block & sample_block) -{ - local_engine::NativeReader block_stream(in); - ProfileInfo info; - while (Block block = block_stream.read()) - { - auto final_block = sample_block.cloneWithColumns(block.mutateColumns()); - info.update(final_block); - join.addBlockToJoin(final_block, true); - } -} +constexpr auto RIHGT_COLUMN_PREFIX = "broadcast_right_"; DB::Block rightSampleBlock(bool use_nulls, const StorageInMemoryMetadata & storage_metadata_, JoinKind kind) { + DB::ColumnsWithTypeAndName new_cols; DB::Block block = storage_metadata_.getSampleBlock(); - if (use_nulls && isLeftOrFull(kind)) - for (auto & col : block) - DB::JoinCommon::convertColumnToNullable(col); - return block; + for (const auto & col : block) + { + // Add a prefix to avoid column name conflicts with left table. + new_cols.emplace_back(col.column, col.type, RIHGT_COLUMN_PREFIX + col.name); + if (use_nulls && isLeftOrFull(kind)) + { + auto & new_col = new_cols.back(); + DB::JoinCommon::convertColumnToNullable(new_col); + } + } + return DB::Block(new_cols); } namespace local_engine @@ -67,46 +68,138 @@ namespace local_engine StorageJoinFromReadBuffer::StorageJoinFromReadBuffer( DB::ReadBuffer & in, size_t row_count_, - const Names & key_names, - bool use_nulls, - std::shared_ptr table_join, + const Names & key_names_, + bool use_nulls_, + DB::JoinKind kind, + DB::JoinStrictness strictness, + bool has_mixed_join_condition, const ColumnsDescription & columns, const ConstraintsDescription & constraints, const String & comment, - const bool overwrite) - : key_names_(key_names), use_nulls_(use_nulls) + const bool overwrite_) + : key_names({}), use_nulls(use_nulls_), row_count(row_count_), overwrite(overwrite_) { - storage_metadata_.setColumns(columns); - storage_metadata_.setConstraints(constraints); - storage_metadata_.setComment(comment); + storage_metadata.setColumns(columns); + storage_metadata.setConstraints(constraints); + storage_metadata.setComment(comment); - for (const auto & key : key_names) - if (!storage_metadata_.getColumns().hasPhysical(key)) + for (const auto & key : key_names_) + if (!storage_metadata.getColumns().hasPhysical(key)) throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "Key column ({}) does not exist in table declaration.", key); - right_sample_block_ = rightSampleBlock(use_nulls, storage_metadata_, table_join->kind()); - join_ = std::make_shared(table_join, right_sample_block_, overwrite, row_count_); - restore(in, *join_, storage_metadata_.getSampleBlock()); + for (const auto & name : key_names_) + key_names.push_back(RIHGT_COLUMN_PREFIX + name); + auto table_join = std::make_shared(SizeLimits(), true, kind, strictness, key_names); + right_sample_block = rightSampleBlock(use_nulls, storage_metadata, table_join->kind()); + /// If there is mixed join conditions, need to build the hash join lazily, which rely on the real table join. + if (!has_mixed_join_condition) + buildJoin(in, right_sample_block, table_join); + else + collectAllInputs(in, right_sample_block); } -DB::JoinPtr StorageJoinFromReadBuffer::getJoinLocked(std::shared_ptr analyzed_join, DB::ContextPtr /*context*/) const +/// The column names may be different in two blocks. +/// and the nullability also could be different, with TPCDS-Q1 as an example. 
+static DB::ColumnWithTypeAndName convertColumnAsNecessary(const DB::ColumnWithTypeAndName & column, const DB::ColumnWithTypeAndName & sample_column) { - if ((analyzed_join->forceNullableRight() && !use_nulls_) - || (!analyzed_join->forceNullableRight() && isLeftOrFull(analyzed_join->kind()) && use_nulls_)) - throw Exception( - ErrorCodes::INCOMPATIBLE_TYPE_OF_JOIN, - "Table {} needs the same join_use_nulls setting as present in LEFT or FULL JOIN", - storage_metadata_.comment); + if (sample_column.type->equals(*column.type)) + return {column.column, column.type, sample_column.name}; + else if ( + sample_column.type->isNullable() && !column.type->isNullable() + && DB::removeNullable(sample_column.type)->equals(*column.type)) + { + auto nullable_column = column; + DB::JoinCommon::convertColumnToNullable(nullable_column); + return {nullable_column.column, sample_column.type, sample_column.name}; + } + else + throw DB::Exception( + DB::ErrorCodes::LOGICAL_ERROR, + "Columns have different types. original:{} expected:{}", + column.dumpStructure(), + sample_column.dumpStructure()); +} + +void StorageJoinFromReadBuffer::buildJoin(DB::ReadBuffer & in, const Block header, std::shared_ptr analyzed_join) +{ + local_engine::NativeReader block_stream(in); + ProfileInfo info; + join = std::make_shared(analyzed_join, header, overwrite, row_count); + while (Block block = block_stream.read()) + { + DB::ColumnsWithTypeAndName columns; + for (size_t i = 0; i < block.columns(); ++i) + { + const auto & column = block.getByPosition(i); + columns.emplace_back(convertColumnAsNecessary(column, header.getByPosition(i))); + } + DB::Block final_block(columns); + info.update(final_block); + join->addBlockToJoin(final_block, true); + } +} - /// TODO: check key columns +void StorageJoinFromReadBuffer::collectAllInputs(DB::ReadBuffer & in, const DB::Block header) +{ + local_engine::NativeReader block_stream(in); + ProfileInfo info; + while (Block block = block_stream.read()) + { + DB::ColumnsWithTypeAndName columns; + for (size_t i = 0; i < block.columns(); ++i) + { + const auto & column = block.getByPosition(i); + columns.emplace_back(convertColumnAsNecessary(column, header.getByPosition(i))); + } + DB::Block final_block(columns); + info.update(final_block); + input_blocks.emplace_back(std::move(final_block)); + } +} - /// Set names qualifiers: table.column -> column - /// It's required because storage join stores non-qualified names - /// Qualifies will be added by join implementation (HashJoin) - analyzed_join->setRightKeys(key_names_); +void StorageJoinFromReadBuffer::buildJoinLazily(DB::Block header, std::shared_ptr analyzed_join) +{ + { + std::shared_lock lock(join_mutex); + if (join) + return; + } + std::unique_lock lock(join_mutex); + if (join) + return; + join = std::make_shared(analyzed_join, header, overwrite, row_count); + while(!input_blocks.empty()) + { + auto & block = *input_blocks.begin(); + DB::ColumnsWithTypeAndName columns; + for (size_t i = 0; i < block.columns(); ++i) + { + const auto & column = block.getByPosition(i); + columns.emplace_back(convertColumnAsNecessary(column, header.getByPosition(i))); + } + DB::Block final_block(columns); + join->addBlockToJoin(final_block, true); + input_blocks.pop_front(); + } +} - HashJoinPtr join_clone = std::make_shared(analyzed_join, right_sample_block_); - join_clone->reuseJoinedData(static_cast(*join_)); +/// The column names of 'rgiht_header' could be different from the ones in `input_blocks`, and we must +/// use 'right_header' to build the HashJoin. 
Otherwise, it will cause exceptions with name mismatches. +/// +/// In most cases, 'getJoinLocked' is called only once, and the input_blocks should not be too large. +/// This will be OK. +DB::JoinPtr StorageJoinFromReadBuffer::getJoinLocked(std::shared_ptr analyzed_join, DB::ContextPtr /*context*/) +{ + if ((analyzed_join->forceNullableRight() && !use_nulls) + || (!analyzed_join->forceNullableRight() && isLeftOrFull(analyzed_join->kind()) && use_nulls)) + throw Exception( + ErrorCodes::INCOMPATIBLE_TYPE_OF_JOIN, + "Table {} needs the same join_use_nulls setting as present in LEFT or FULL JOIN", + storage_metadata.comment); + buildJoinLazily(getRightSampleBlock(), analyzed_join); + HashJoinPtr join_clone = std::make_shared(analyzed_join, right_sample_block); + /// reuseJoinedData will set the flag `HashJoin::from_storage_join` which is required by `FilledStep` + join_clone->reuseJoinedData(static_cast(*join)); return join_clone; } } diff --git a/cpp-ch/local-engine/Join/StorageJoinFromReadBuffer.h b/cpp-ch/local-engine/Join/StorageJoinFromReadBuffer.h index 2e949fa87c678..ddefda69c30fe 100644 --- a/cpp-ch/local-engine/Join/StorageJoinFromReadBuffer.h +++ b/cpp-ch/local-engine/Join/StorageJoinFromReadBuffer.h @@ -15,6 +15,7 @@ * limitations under the License. */ #pragma once +#include #include #include @@ -23,6 +24,8 @@ namespace DB { class TableJoin; class IJoin; using JoinPtr = std::shared_ptr; +class HashJoin; +class ReadBuffer; } namespace local_engine @@ -33,23 +36,37 @@ class StorageJoinFromReadBuffer { public: StorageJoinFromReadBuffer( DB::ReadBuffer & in_, - size_t row_count_, + size_t row_count, const DB::Names & key_names_, bool use_nulls_, - std::shared_ptr table_join_, + DB::JoinKind kind, + DB::JoinStrictness strictness, + bool has_mixed_join_condition, const DB::ColumnsDescription & columns_, const DB::ConstraintsDescription & constraints_, const String & comment, bool overwrite_); - DB::JoinPtr getJoinLocked(std::shared_ptr analyzed_join, DB::ContextPtr context) const; - const DB::Block & getRightSampleBlock() const { return right_sample_block_; } + /// The columns' names in right_header may be different from the names in the ColumnsDescription + /// in the constructor. + /// This should be called once. 
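For context on buildJoinLazily above: the one-time hash join construction is guarded by a std::shared_mutex in a double-checked pattern, so concurrent callers of getJoinLocked only take the cheap shared lock once the join exists. A minimal sketch of that pattern, with placeholder types (HashTable and LazyHolder are not Gluten classes):

    #include <memory>
    #include <mutex>
    #include <shared_mutex>

    struct HashTable { /* stands in for the real hash join */ };

    class LazyHolder
    {
    public:
        // Returns the table, building it at most once even under concurrent calls.
        std::shared_ptr<HashTable> get()
        {
            {
                std::shared_lock lock(mutex); // fast path: already built
                if (table)
                    return table;
            }
            std::unique_lock lock(mutex);     // slow path: re-check, then build
            if (!table)
                table = std::make_shared<HashTable>();
            return table;
        }

    private:
        std::shared_mutex mutex;
        std::shared_ptr<HashTable> table;
    };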
+ DB::JoinPtr getJoinLocked(std::shared_ptr analyzed_join, DB::ContextPtr context); + const DB::Block & getRightSampleBlock() const { return right_sample_block; } private: - DB::StorageInMemoryMetadata storage_metadata_; - const DB::Names key_names_; - bool use_nulls_; - DB::JoinPtr join_; - DB::Block right_sample_block_; + DB::StorageInMemoryMetadata storage_metadata; + DB::Names key_names; + bool use_nulls; + size_t row_count; + bool overwrite; + DB::Block right_sample_block; + std::shared_mutex join_mutex; + std::list input_blocks; + std::shared_ptr join = nullptr; + + void readAllBlocksFromInput(DB::ReadBuffer & in); + void buildJoin(DB::ReadBuffer & in, const DB::Block header, std::shared_ptr analyzed_join); + void collectAllInputs(DB::ReadBuffer & in, const DB::Block header); + void buildJoinLazily(DB::Block header, std::shared_ptr analyzed_join); }; } diff --git a/cpp-ch/local-engine/Operator/BlocksBufferPoolTransform.cpp b/cpp-ch/local-engine/Operator/BlocksBufferPoolTransform.cpp index 3427a81c60c3b..16a5bd5d26196 100644 --- a/cpp-ch/local-engine/Operator/BlocksBufferPoolTransform.cpp +++ b/cpp-ch/local-engine/Operator/BlocksBufferPoolTransform.cpp @@ -43,7 +43,7 @@ DB::IProcessor::Status BlocksBufferPoolTransform::prepare() { auto & output = outputs.front(); auto & input = inputs.front(); - if (output.isFinished()) + if (output.isFinished() || isCancelled()) { input.close(); return Status::Finished; diff --git a/cpp-ch/local-engine/Operator/EmptyProjectStep.cpp b/cpp-ch/local-engine/Operator/EmptyProjectStep.cpp index 58cb33e59a6f3..62991585f3b5a 100644 --- a/cpp-ch/local-engine/Operator/EmptyProjectStep.cpp +++ b/cpp-ch/local-engine/Operator/EmptyProjectStep.cpp @@ -39,7 +39,7 @@ class EmptyProject : public DB::IProcessor { auto & output = outputs.front(); auto & input = inputs.front(); - if (output.isFinished()) + if (output.isFinished() || isCancelled()) { input.close(); return Status::Finished; diff --git a/cpp-ch/local-engine/Operator/ExpandTransform.cpp b/cpp-ch/local-engine/Operator/ExpandTransform.cpp index d48d484397c23..f5787163c5a1e 100644 --- a/cpp-ch/local-engine/Operator/ExpandTransform.cpp +++ b/cpp-ch/local-engine/Operator/ExpandTransform.cpp @@ -48,7 +48,7 @@ ExpandTransform::Status ExpandTransform::prepare() auto & output = outputs.front(); auto & input = inputs.front(); - if (output.isFinished()) + if (output.isFinished() || isCancelled()) { input.close(); return Status::Finished; @@ -79,12 +79,12 @@ ExpandTransform::Status ExpandTransform::prepare() if (!input.hasData()) return Status::NeedData; - + input_chunk = input.pull(true); has_input = true; expand_expr_iterator = 0; } - + return Status::Ready; } @@ -92,6 +92,7 @@ void ExpandTransform::work() { if (expand_expr_iterator >= project_set_exprs.getExpandRows()) throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "expand_expr_iterator >= project_set_exprs.getExpandRows()"); + const auto & original_cols = input_chunk.getColumns(); size_t rows = input_chunk.getNumRows(); DB::Columns cols; @@ -103,7 +104,7 @@ void ExpandTransform::work() if (kind == EXPAND_FIELD_KIND_SELECTION) { - const auto & original_col = original_cols[field.get()]; + const auto & original_col = original_cols.at(field.get()); if (type->isNullable() == original_col->isNullable()) { cols.push_back(original_col); @@ -139,8 +140,9 @@ void ExpandTransform::work() } } output_chunk = DB::Chunk(cols, rows); - expand_expr_iterator += 1; - has_output = expand_expr_iterator <= project_set_exprs.getExpandRows(); + has_output = true; + + 
++expand_expr_iterator; has_input = expand_expr_iterator < project_set_exprs.getExpandRows(); } } diff --git a/cpp-ch/local-engine/Operator/GraceMergingAggregatedStep.cpp b/cpp-ch/local-engine/Operator/GraceMergingAggregatedStep.cpp index ca86785febfe6..a9a2df276a594 100644 --- a/cpp-ch/local-engine/Operator/GraceMergingAggregatedStep.cpp +++ b/cpp-ch/local-engine/Operator/GraceMergingAggregatedStep.cpp @@ -146,7 +146,7 @@ GraceMergingAggregatedTransform::Status GraceMergingAggregatedTransform::prepare { auto & output = outputs.front(); auto & input = inputs.front(); - if (output.isFinished()) + if (output.isFinished() || isCancelled()) { input.close(); return Status::Finished; @@ -224,7 +224,7 @@ void GraceMergingAggregatedTransform::work() block_converter = prepareBucketOutputBlocks(current_bucket_index); if (block_converter) break; - current_bucket_index++; + current_bucket_index++; } } if (!block_converter) @@ -455,7 +455,7 @@ std::unique_ptr GraceMergingAggregatedTransform::pr block = {}; } } - + if (buffer_file_stream.original_file_stream) { buffer_file_stream.original_file_stream->finishWriting(); diff --git a/cpp-ch/local-engine/Operator/PartitionColumnFillingTransform.h b/cpp-ch/local-engine/Operator/PartitionColumnFillingTransform.h index 2eac10f3975b6..692991b3fc9d0 100644 --- a/cpp-ch/local-engine/Operator/PartitionColumnFillingTransform.h +++ b/cpp-ch/local-engine/Operator/PartitionColumnFillingTransform.h @@ -25,9 +25,11 @@ class PartitionColumnFillingTransform : public DB::ISimpleTransform public: PartitionColumnFillingTransform( const DB::Block & input_, const DB::Block & output_, const String & partition_col_name_, const String & partition_col_value_); - void transform(DB::Chunk & chunk) override; + String getName() const override { return "PartitionColumnFillingTransform"; } + void transform(DB::Chunk & chunk) override; + private: DB::ColumnPtr createPartitionColumn(); diff --git a/cpp-ch/local-engine/Operator/ReplicateRowsStep.cpp b/cpp-ch/local-engine/Operator/ReplicateRowsStep.cpp new file mode 100644 index 0000000000000..f2d4bc8a865dc --- /dev/null +++ b/cpp-ch/local-engine/Operator/ReplicateRowsStep.cpp @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "ReplicateRowsStep.h" + +#include + +#include + +namespace DB +{ +namespace ErrorCodes +{ +extern const int LOGICAL_ERROR; +} +} + +namespace local_engine +{ +static DB::ITransformingStep::Traits getTraits() +{ + return DB::ITransformingStep::Traits + { + { + .preserves_number_of_streams = true, + .preserves_sorting = false, + }, + { + .preserves_number_of_rows = false, + } + }; +} + +ReplicateRowsStep::ReplicateRowsStep(const DB::DataStream & input_stream) + : ITransformingStep(input_stream, transformHeader(input_stream.header), getTraits()) +{ +} + +DB::Block ReplicateRowsStep::transformHeader(const DB::Block& input) +{ + DB::Block output; + for (int i = 1; i < input.columns(); i++) + { + output.insert(input.getByPosition(i)); + } + return output; +} + +void ReplicateRowsStep::transformPipeline( + DB::QueryPipelineBuilder & pipeline, + const DB::BuildQueryPipelineSettings & /*settings*/) +{ + pipeline.addSimpleTransform( + [&](const DB::Block & header) + { + return std::make_shared(header); + }); +} + +void ReplicateRowsStep::updateOutputStream() +{ + output_stream = createOutputStream(input_streams.front(), transformHeader(input_streams.front().header), getDataStreamTraits()); +} + +ReplicateRowsTransform::ReplicateRowsTransform(const DB::Block & input_header_) + : ISimpleTransform(input_header_, ReplicateRowsStep::transformHeader(input_header_), true) +{ +} + +void ReplicateRowsTransform::transform(DB::Chunk & chunk) +{ + auto replica_column = chunk.getColumns().front(); + size_t total_rows = 0; + for (int i = 0; i < replica_column->size(); i++) + { + total_rows += replica_column->get64(i); + } + + auto columns = chunk.detachColumns(); + DB::MutableColumns mutable_columns; + for (int i = 1; i < columns.size(); i++) + { + mutable_columns.push_back(columns[i]->cloneEmpty()); + mutable_columns.back()->reserve(total_rows); + DB::ColumnPtr src_col = columns[i]; + DB::MutableColumnPtr & cur = mutable_columns.back(); + for (int j = 0; j < replica_column->size(); j++) + { + cur->insertManyFrom(*src_col, j, replica_column->getUInt(j)); + } + } + + chunk.setColumns(std::move(mutable_columns), total_rows); +} +} \ No newline at end of file diff --git a/cpp-ch/local-engine/Operator/ReplicateRowsStep.h b/cpp-ch/local-engine/Operator/ReplicateRowsStep.h new file mode 100644 index 0000000000000..f588bf0ceb8c1 --- /dev/null +++ b/cpp-ch/local-engine/Operator/ReplicateRowsStep.h @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +namespace local_engine +{ + +class ReplicateRowsStep : public DB::ITransformingStep +{ +public: + ReplicateRowsStep(const DB::DataStream& input_stream); + + static DB::Block transformHeader(const DB::Block& input); + + String getName() const override { return "ReplicateRowsStep"; } + void transformPipeline(DB::QueryPipelineBuilder& pipeline, + const DB::BuildQueryPipelineSettings& settings) override; +private: + void updateOutputStream() override; +}; + +class ReplicateRowsTransform : public DB::ISimpleTransform +{ +public: + ReplicateRowsTransform(const DB::Block& input_header_); + + String getName() const override { return "ReplicateRowsTransform"; } + void transform(DB::Chunk&) override; + +}; +} diff --git a/cpp-ch/local-engine/Operator/StreamingAggregatingStep.cpp b/cpp-ch/local-engine/Operator/StreamingAggregatingStep.cpp index 698d353b1d4b5..65d77f8e968f0 100644 --- a/cpp-ch/local-engine/Operator/StreamingAggregatingStep.cpp +++ b/cpp-ch/local-engine/Operator/StreamingAggregatingStep.cpp @@ -67,11 +67,12 @@ StreamingAggregatingTransform::Status StreamingAggregatingTransform::prepare() { auto & output = outputs.front(); auto & input = inputs.front(); - if (output.isFinished()) + if (output.isFinished() || isCancelled()) { input.close(); return Status::Finished; } + if (has_output) { if (output.canPush()) @@ -140,10 +141,10 @@ bool StreamingAggregatingTransform::needEvict() auto max_mem_used = static_cast(context->getSettingsRef().max_memory_usage * max_allowed_memory_usage_ratio); auto current_result_rows = data_variants->size(); - /// avoid evict empty or too small aggregated results. + /// avoid evict empty or too small aggregated results. if (current_result_rows < aggregated_keys_before_evict) return false; - + /// If the grouping keys is high cardinality, we should evict data variants early, and avoid to use a big /// hash table. 
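As an aside on the eviction heuristic in needEvict: it combines a minimum-result-size guard, a cardinality ratio, and a memory ceiling. The sketch below restates that decision as a free function; only aggregated_keys_before_evict, high_cardinality_threshold and the max-memory computation are visible in this hunk, so the final memory comparison and the remaining parameter names are assumptions for illustration.

    #include <cstddef>

    // Illustrative restatement of the eviction decision; not the Gluten implementation.
    bool shouldEvict(
        size_t current_result_rows,
        size_t aggregated_keys_before_evict,
        double total_output_rows,
        double total_input_rows,
        double high_cardinality_threshold,
        size_t current_memory_usage,
        size_t max_allowed_memory)
    {
        // Avoid evicting empty or very small aggregation states.
        if (current_result_rows < aggregated_keys_before_evict)
            return false;
        // High-cardinality grouping keys: evict early rather than grow a huge hash table.
        if (total_input_rows > 0 && total_output_rows / total_input_rows > high_cardinality_threshold)
            return true;
        // Otherwise evict only once the memory budget is exceeded (assumed final check).
        return current_memory_usage > max_allowed_memory;
    }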
if (static_cast(total_output_rows)/total_input_rows > high_cardinality_threshold) diff --git a/cpp-ch/local-engine/Parser/CHColumnToSparkRow.cpp b/cpp-ch/local-engine/Parser/CHColumnToSparkRow.cpp index 6f8df0ecbd75a..5bb66e4b3f9dd 100644 --- a/cpp-ch/local-engine/Parser/CHColumnToSparkRow.cpp +++ b/cpp-ch/local-engine/Parser/CHColumnToSparkRow.cpp @@ -453,7 +453,7 @@ std::unique_ptr CHColumnToSparkRow::convertCHColumnToSparkRow(cons if (!block.columns()) throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "A block with empty columns"); std::unique_ptr spark_row_info = std::make_unique(block, masks); - spark_row_info->setBufferAddress(reinterpret_cast(alloc(spark_row_info->getTotalBytes(), 64))); + spark_row_info->setBufferAddress(static_cast(alloc(spark_row_info->getTotalBytes(), 64))); // spark_row_info->setBufferAddress(alignedAlloc(spark_row_info->getTotalBytes(), 64)); memset(spark_row_info->getBufferAddress(), 0, spark_row_info->getTotalBytes()); for (auto col_idx = 0; col_idx < spark_row_info->getNumCols(); col_idx++) @@ -820,6 +820,11 @@ int64_t VariableLengthDataWriter::writeStruct(size_t row_idx, const DB::Tuple & auto v = field_value.get(); writer.unsafeWrite(reinterpret_cast(&v), buffer_address + offset + start + len_null_bitmap + i * 8); } + else if (writer.getWhichDataType().isDecimal64() || writer.getWhichDataType().isDateTime64()) + { + auto v = field_value.get(); + writer.unsafeWrite(reinterpret_cast(&v), buffer_address + offset + start + len_null_bitmap + i * 8); + } else writer.unsafeWrite( reinterpret_cast(&field_value.get()), buffer_address + offset + start + len_null_bitmap + i * 8); diff --git a/cpp-ch/local-engine/Parser/FilterRelParser.cpp b/cpp-ch/local-engine/Parser/FilterRelParser.cpp index 19facf3bff960..e0098f747c2ab 100644 --- a/cpp-ch/local-engine/Parser/FilterRelParser.cpp +++ b/cpp-ch/local-engine/Parser/FilterRelParser.cpp @@ -31,7 +31,7 @@ DB::QueryPlanPtr FilterRelParser::parse(DB::QueryPlanPtr query_plan, const subst substrait::Rel final_rel = rel; rewriter.rewrite(final_rel); - const auto & filter_rel = rel.filter(); + const auto & filter_rel = final_rel.filter(); std::string filter_name; auto input_header = query_plan->getCurrentDataStream().header; @@ -59,14 +59,18 @@ DB::QueryPlanPtr FilterRelParser::parse(DB::QueryPlanPtr query_plan, const subst filter_step->setStepDescription("WHERE"); steps.emplace_back(filter_step.get()); query_plan->addStep(std::move(filter_step)); - + + // header maybe changed, need to rollback it + if (!blocksHaveEqualStructure(input_header, query_plan->getCurrentDataStream().header)) { + steps.emplace_back(getPlanParser()->addRollbackFilterHeaderStep(query_plan, input_header)); + } + // remove nullable auto * remove_null_step = getPlanParser()->addRemoveNullableStep(*query_plan, non_nullable_columns); if (remove_null_step) { steps.emplace_back(remove_null_step); } - return query_plan; } diff --git a/cpp-ch/local-engine/Parser/JoinRelParser.cpp b/cpp-ch/local-engine/Parser/JoinRelParser.cpp index 8f7f35d5ef23b..a6a146954d6f7 100644 --- a/cpp-ch/local-engine/Parser/JoinRelParser.cpp +++ b/cpp-ch/local-engine/Parser/JoinRelParser.cpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -31,6 +31,10 @@ #include #include +#include +#include + + namespace DB { namespace ErrorCodes @@ -179,40 +183,182 @@ DB::QueryPlanPtr JoinRelParser::parseOp(const substrait::Rel & rel, std::list JoinRelParser::extractTableSidesFromExpression(const substrait::Expression & expr, const DB::Block & 
left_header, const DB::Block & right_header) +{ + std::unordered_set table_sides; + if (expr.has_scalar_function()) + { + for (const auto & arg : expr.scalar_function().arguments()) + { + auto table_sides_from_arg = extractTableSidesFromExpression(arg.value(), left_header, right_header); + table_sides.insert(table_sides_from_arg.begin(), table_sides_from_arg.end()); + } + } + else if (expr.has_selection() && expr.selection().has_direct_reference() && expr.selection().direct_reference().has_struct_field()) + { + auto pos = expr.selection().direct_reference().struct_field().field(); + if (pos < left_header.columns()) + { + table_sides.insert(DB::JoinTableSide::Left); + } + else + { + table_sides.insert(DB::JoinTableSide::Right); + } + } + else if (expr.has_singular_or_list()) + { + auto child_table_sides = extractTableSidesFromExpression(expr.singular_or_list().value(), left_header, right_header); + table_sides.insert(child_table_sides.begin(), child_table_sides.end()); + for (const auto & option : expr.singular_or_list().options()) + { + child_table_sides = extractTableSidesFromExpression(option, left_header, right_header); + table_sides.insert(child_table_sides.begin(), child_table_sides.end()); + } + } + else if (expr.has_cast()) + { + auto child_table_sides = extractTableSidesFromExpression(expr.cast().input(), left_header, right_header); + table_sides.insert(child_table_sides.begin(), child_table_sides.end()); + } + else if (expr.has_if_then()) + { + for (const auto & if_child : expr.if_then().ifs()) + { + auto child_table_sides = extractTableSidesFromExpression(if_child.if_(), left_header, right_header); + table_sides.insert(child_table_sides.begin(), child_table_sides.end()); + child_table_sides = extractTableSidesFromExpression(if_child.then(), left_header, right_header); + table_sides.insert(child_table_sides.begin(), child_table_sides.end()); + } + auto child_table_sides = extractTableSidesFromExpression(expr.if_then().else_(), left_header, right_header); + table_sides.insert(child_table_sides.begin(), child_table_sides.end()); + } + else if (expr.has_literal()) + { + // nothing + } + else + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Illegal expression '{}'", expr.DebugString()); + } + return table_sides; +} + + +void JoinRelParser::renamePlanColumns(DB::QueryPlan & left, DB::QueryPlan & right, const StorageJoinFromReadBuffer & storage_join) +{ + /// To support mixed join conditions, we must make sure that the column names in the right be the same as + /// storage_join's right sample block. + ActionsDAGPtr project = ActionsDAG::makeConvertingActions( + right.getCurrentDataStream().header.getColumnsWithTypeAndName(), + storage_join.getRightSampleBlock().getColumnsWithTypeAndName(), + ActionsDAG::MatchColumnsMode::Position); + + if (project) + { + QueryPlanStepPtr project_step = std::make_unique(right.getCurrentDataStream(), project); + project_step->setStepDescription("Rename Broadcast Table Name"); + steps.emplace_back(project_step.get()); + right.addStep(std::move(project_step)); + } + + /// If the columns name in right table is duplicated with left table, we need to rename the left table's columns, + /// avoid the columns name in the right table be changed in `addConvertStep`. + /// This could happen in tpc-ds q44. 
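Stated as a rule, the renaming above is: a left-side column keeps its name unless the (already prefixed) broadcast right side uses the same name, in which case it gets a unique prefix. A minimal sketch of just that rule, detached from ActionsDAG; the helper name and signature are invented for illustration.

    #include <string>
    #include <unordered_set>
    #include <vector>

    // Returns the renamed left-side column names, prefixing only the ones that
    // would collide with a right-side (broadcast) column name.
    std::vector<std::string> renameLeftColumns(
        const std::vector<std::string> & left_names,
        const std::unordered_set<std::string> & right_names,
        const std::string & unique_left_prefix)
    {
        std::vector<std::string> renamed;
        renamed.reserve(left_names.size());
        for (const auto & name : left_names)
            renamed.push_back(right_names.count(name) ? unique_left_prefix + name : name);
        return renamed;
    }

For example, with right names {"broadcast_right_id", "value"} and left names {"id", "value"}, only "value" is renamed.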
+ DB::ColumnsWithTypeAndName new_left_cols; + const auto & right_header = right.getCurrentDataStream().header; + auto left_prefix = getUniqueName("left"); + for (const auto & col : left.getCurrentDataStream().header) + { + if (right_header.has(col.name)) + { + new_left_cols.emplace_back(col.column, col.type, left_prefix + col.name); + } + else + { + new_left_cols.emplace_back(col.column, col.type, col.name); + } + } + project = ActionsDAG::makeConvertingActions( + left.getCurrentDataStream().header.getColumnsWithTypeAndName(), + new_left_cols, + ActionsDAG::MatchColumnsMode::Position); + + if (project) + { + QueryPlanStepPtr project_step = std::make_unique(left.getCurrentDataStream(), project); + project_step->setStepDescription("Rename Left Table Name for broadcast join"); + steps.emplace_back(project_step.get()); + left.addStep(std::move(project_step)); + } +} + DB::QueryPlanPtr JoinRelParser::parseJoin(const substrait::JoinRel & join, DB::QueryPlanPtr left, DB::QueryPlanPtr right) { auto join_opt_info = parseJoinOptimizationInfo(join); auto storage_join = join_opt_info.is_broadcast ? BroadCastJoinBuilder::getJoin(join_opt_info.storage_join_key) : nullptr; - if (storage_join) { - ActionsDAGPtr project = ActionsDAG::makeConvertingActions( - right->getCurrentDataStream().header.getColumnsWithTypeAndName(), - storage_join->getRightSampleBlock().getColumnsWithTypeAndName(), - ActionsDAG::MatchColumnsMode::Position); + renamePlanColumns(*left, *right, *storage_join); + } - if (project) + auto table_join = createDefaultTableJoin(join.type()); + DB::Block right_header_before_convert_step = right->getCurrentDataStream().header; + addConvertStep(*table_join, *left, *right); + + // Add a check to find error easily. + if (storage_join) + { + bool is_col_names_changed = false; + const auto & current_right_header = right->getCurrentDataStream().header; + if (right_header_before_convert_step.columns() != current_right_header.columns()) + is_col_names_changed = true; + if (!is_col_names_changed) { - QueryPlanStepPtr project_step = std::make_unique(right->getCurrentDataStream(), project); - project_step->setStepDescription("Rename Broadcast Table Name"); - steps.emplace_back(project_step.get()); - right->addStep(std::move(project_step)); + for (size_t i = 0; i < right_header_before_convert_step.columns(); i++) + { + if (right_header_before_convert_step.getByPosition(i).name != current_right_header.getByPosition(i).name) + { + is_col_names_changed = true; + break; + } + } + } + if (is_col_names_changed) + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "For broadcast join, we must not change the columns name in the right table.\nleft header:{},\nright header: {} -> {}", + left->getCurrentDataStream().header.dumpStructure(), + right_header_before_convert_step.dumpStructure(), + right->getCurrentDataStream().header.dumpStructure()); } } - auto table_join = createDefaultTableJoin(join.type()); - addConvertStep(*table_join, *left, *right); Names after_join_names; auto left_names = left->getCurrentDataStream().header.getNames(); after_join_names.insert(after_join_names.end(), left_names.begin(), left_names.end()); auto right_name = table_join->columnsFromJoinedTable().getNames(); after_join_names.insert(after_join_names.end(), right_name.begin(), right_name.end()); - bool add_filter_step = tryAddPushDownFilter(*table_join, join, *left, *right, table_join->columnsFromJoinedTable(), after_join_names); + + auto left_header = left->getCurrentDataStream().header; + auto right_header = 
right->getCurrentDataStream().header; QueryPlanPtr query_plan; + + /// Support only one join clause. + table_join->addDisjunct(); + /// some examples to explain when the post_join_filter is not empty + /// - on t1.key = t2.key and t1.v1 > 1 and t2.v1 > 1, 't1.v1> 1' is in the post filter. but 't2.v1 > 1' + /// will be pushed down into right table by spark and is not in the post filter. 't1.key = t2.key ' is + /// in JoinRel::expression. + /// - on t1.key = t2. key and t1.v1 > t2.v2, 't1.v1 > t2.v2' is in the post filter. + collectJoinKeys(*table_join, join, left_header, right_header); + if (storage_join) { + + applyJoinFilter(*table_join, join, *left, *right, true); auto broadcast_hash_join = storage_join->getJoinLocked(table_join, context); + QueryPlanStepPtr join_step = std::make_unique(left->getCurrentDataStream(), broadcast_hash_join, 8192); join_step->setStepDescription("STORAGE_JOIN"); @@ -224,6 +370,18 @@ DB::QueryPlanPtr JoinRelParser::parseJoin(const substrait::JoinRel & join, DB::Q } else if (join_opt_info.is_smj) { + bool need_post_filter = !applyJoinFilter(*table_join, join, *left, *right, false); + + /// If applyJoinFilter returns false, it means there are mixed conditions in the post_join_filter. + /// It should be a inner join. + /// TODO: make smj support mixed conditions + if (need_post_filter && table_join->kind() != DB::JoinKind::Inner) + { + throw DB::Exception( + DB::ErrorCodes::LOGICAL_ERROR, + "Sort merge join doesn't support mixed join conditions, except inner join."); + } + JoinPtr smj_join = std::make_shared(table_join, right->getCurrentDataStream().header.cloneEmpty(), -1); MultiEnum join_algorithm = context->getSettingsRef().join_algorithm; QueryPlanStepPtr join_step @@ -237,12 +395,14 @@ DB::QueryPlanPtr JoinRelParser::parseJoin(const substrait::JoinRel & join, DB::Q query_plan = std::make_unique(); query_plan->unitePlans(std::move(join_step), {std::move(plans)}); + if (need_post_filter) + addPostFilter(*query_plan, join); } else { - /// TODO: make grace hash join be the default hash join algorithm. - /// - /// Following is some configuration for grace hash join. + applyJoinFilter(*table_join, join, *left, *right, true); + + /// Following is some configurations for grace hash join. /// - spark.gluten.sql.columnar.backend.ch.runtime_settings.join_algorithm=grace_hash. This will /// enable grace hash join. /// - spark.gluten.sql.columnar.backend.ch.runtime_settings.max_bytes_in_join=3145728. This setup @@ -278,28 +438,15 @@ DB::QueryPlanPtr JoinRelParser::parseJoin(const substrait::JoinRel & join, DB::Q } reorderJoinOutput(*query_plan, after_join_names); - if (add_filter_step) - { - addPostFilter(*query_plan, join); - } return query_plan; } void JoinRelParser::addConvertStep(TableJoin & table_join, DB::QueryPlan & left, DB::QueryPlan & right) { - - /// After https://github.com/ClickHouse/ClickHouse/pull/61216, We will failed at tryPushDownFilter() in filterPushDown.cpp - /// Here is a workaround, refer to chooseJoinAlgorithm() in PlannerJoins.cpp, it always call TableJoin::setRename to - /// create aliases for columns in the right table - /// By using right table header name sets, so TableJoin::deduplicateAndQualifyColumnNames can do same thing as chooseJoinAlgorithm() - /// - /// Affected UT fixed bh this workaround: - /// GlutenClickHouseTPCHParquetRFSuite:TPCH Q17, Q19, Q20, Q21 + /// If the columns name in right table is duplicated with left table, we need to rename the right table's columns. 
NameSet left_columns_set; - for (const auto & col : right.getCurrentDataStream().header.getNames()) - { + for (const auto & col : left.getCurrentDataStream().header.getNames()) left_columns_set.emplace(col); - } table_join.setColumnsFromJoinedTable( right.getCurrentDataStream().header.getNamesAndTypesList(), left_columns_set, getUniqueName("right") + "."); @@ -327,7 +474,7 @@ void JoinRelParser::addConvertStep(TableJoin & table_join, DB::QueryPlan & left, rename_dag->getOutputs()[pos] = &alias; } } - rename_dag->projectInput(); + QueryPlanStepPtr project_step = std::make_unique(right.getCurrentDataStream(), rename_dag); project_step->setStepDescription("Right Table Rename"); steps.emplace_back(project_step.get()); @@ -360,117 +507,179 @@ void JoinRelParser::addConvertStep(TableJoin & table_join, DB::QueryPlan & left, } } -void JoinRelParser::addPostFilter(DB::QueryPlan & query_plan, const substrait::JoinRel & join) +/// Join keys are collected from substrait::JoinRel::expression() which only contains the equal join conditions. +void JoinRelParser::collectJoinKeys( + TableJoin & table_join, const substrait::JoinRel & join_rel, const DB::Block & left_header, const DB::Block & right_header) { - std::string filter_name; - auto actions_dag = std::make_shared(query_plan.getCurrentDataStream().header.getColumnsWithTypeAndName()); - if (!join.post_join_filter().has_scalar_function()) + if (!join_rel.has_expression()) + return; + const auto & expr = join_rel.expression(); + auto & join_clause = table_join.getClauses().back(); + std::list expressions_stack; + expressions_stack.push_back(&expr); + while (!expressions_stack.empty()) { - // It may be singular_or_list - auto * in_node = getPlanParser()->parseExpression(actions_dag, join.post_join_filter()); - filter_name = in_node->result_name; - } - else - { - getPlanParser()->parseFunction(query_plan.getCurrentDataStream().header, join.post_join_filter(), filter_name, actions_dag, true); + /// Must handle the expressions in DF order. It matters in sort merge join. 
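The ordering requirement in the comment above can be illustrated in isolation: collectJoinKeys walks the "and"-chained equality conditions with an explicit stack, pushing the right argument before the left one so conditions are visited left to right. A small self-contained sketch of that ordering, using a toy expression type instead of substrait::Expression:

    #include <list>
    #include <string>
    #include <vector>

    // Toy stand-in for substrait::Expression: either an "equals" leaf or an
    // "and" node whose two children are always set.
    struct Expr
    {
        std::string op;                // "equals" or "and"
        const Expr * left = nullptr;
        const Expr * right = nullptr;
    };

    // Visit the "equals" leaves left to right with an explicit stack, the same
    // order the join keys must be added in for sort merge join.
    std::vector<const Expr *> collectEqualConditions(const Expr & root)
    {
        std::vector<const Expr *> leaves;
        std::list<const Expr *> stack{&root};
        while (!stack.empty())
        {
            const Expr * current = stack.back();
            stack.pop_back();
            if (current->op == "equals")
            {
                leaves.push_back(current);
                continue;
            }
            // Push the right child first so the left child is processed first.
            stack.push_back(current->right);
            stack.push_back(current->left);
        }
        return leaves;
    }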
+ const auto * current_expr = expressions_stack.back(); + expressions_stack.pop_back(); + if (!current_expr->has_scalar_function()) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Function expression is expected"); + auto function_name = parseFunctionName(current_expr->scalar_function().function_reference(), current_expr->scalar_function()); + if (!function_name) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Invalid function expression"); + if (*function_name == "equals") + { + String left_key, right_key; + size_t left_pos = 0, right_pos = 0; + for (const auto & arg : current_expr->scalar_function().arguments()) + { + if (!arg.value().has_selection() || !arg.value().selection().has_direct_reference() + || !arg.value().selection().direct_reference().has_struct_field()) + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "A column reference is expected"); + } + auto col_pos_ref = arg.value().selection().direct_reference().struct_field().field(); + if (col_pos_ref < left_header.columns()) + { + left_pos = col_pos_ref; + left_key = left_header.getByPosition(col_pos_ref).name; + } + else + { + right_pos = col_pos_ref - left_header.columns(); + right_key = right_header.getByPosition(col_pos_ref - left_header.columns()).name; + } + } + if (left_key.empty() || right_key.empty()) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Invalid key equal join condition"); + join_clause.addKey(left_key, right_key, false); + } + else if (*function_name == "and") + { + expressions_stack.push_back(¤t_expr->scalar_function().arguments().at(1).value()); + expressions_stack.push_back(¤t_expr->scalar_function().arguments().at(0).value()); + } + else + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Unknow function: {}", *function_name); + } } - auto filter_step = std::make_unique(query_plan.getCurrentDataStream(), actions_dag, filter_name, true); - filter_step->setStepDescription("Post Join Filter"); - steps.emplace_back(filter_step.get()); - query_plan.addStep(std::move(filter_step)); } -bool JoinRelParser::tryAddPushDownFilter( - TableJoin & table_join, - const substrait::JoinRel & join, - DB::QueryPlan & left, - DB::QueryPlan & right, - const NamesAndTypesList & alias_right, - const Names & names) +bool JoinRelParser::applyJoinFilter( + DB::TableJoin & table_join, const substrait::JoinRel & join_rel, DB::QueryPlan & left, DB::QueryPlan & right, bool allow_mixed_condition) { - try + if (!join_rel.has_post_join_filter()) + return true; + const auto & expr = join_rel.post_join_filter(); + + const auto & left_header = left.getCurrentDataStream().header; + const auto & right_header = right.getCurrentDataStream().header; + ColumnsWithTypeAndName mixed_columns; + std::unordered_set added_column_name; + for (const auto & col : left_header.getColumnsWithTypeAndName()) + { + mixed_columns.emplace_back(col); + added_column_name.insert(col.name); + } + for (const auto & col : right_header.getColumnsWithTypeAndName()) { - ASTParser astParser(context, function_mapping, getPlanParser()); - ASTs args; + const auto & renamed_col_name = table_join.renamedRightColumnNameWithAlias(col.name); + if (added_column_name.find(col.name) != added_column_name.end()) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Right column's name conflict with left column: {}", col.name); + mixed_columns.emplace_back(col); + added_column_name.insert(col.name); + } + DB::Block mixed_header(mixed_columns); - if (join.has_expression()) - { - args.emplace_back(astParser.parseToAST(names, join.expression())); - } + auto 
table_sides = extractTableSidesFromExpression(expr, left_header, right_header); - args.emplace_back(astParser.parseToAST(names, join.expression())); - } + auto get_input_expressions = [](const DB::Block & header) + { + std::vector exprs; + for (size_t i = 0; i < header.columns(); ++i) { - args.emplace_back(astParser.parseToAST(names, join.post_join_filter())); + substrait::Expression expr; + expr.mutable_selection()->mutable_direct_reference()->mutable_struct_field()->set_field(i); + exprs.emplace_back(expr); } - - if (args.empty()) - return false; - - ASTPtr ast = args.size() == 1 ? args.back() : makeASTFunction("and", args); - - bool is_asof = (table_join.strictness() == JoinStrictness::Asof); - - Aliases aliases; - DatabaseAndTableWithAlias left_table_name; - DatabaseAndTableWithAlias right_table_name; - TableWithColumnNamesAndTypes left_table(left_table_name, left.getCurrentDataStream().header.getNamesAndTypesList()); - TableWithColumnNamesAndTypes right_table(right_table_name, alias_right); - - CollectJoinOnKeysVisitor::Data data{table_join, left_table, right_table, aliases, is_asof}; - if (auto * or_func = ast->as(); or_func && or_func->name == "or") + return exprs; + }; + + /// If the columns in the expression are all from one table, use analyzer_left_filter_condition_column_name + /// or analyzer_right_filter_condition_column_name to filter the join result data. It requires building the filter + /// column first. + /// If the columns in the expression are from both tables, use mixed_join_expression to filter the join result data. + /// The filter columns will be built inside the join step. + if (table_sides.size() == 1) + { + auto table_side = *table_sides.begin(); + if (table_side == DB::JoinTableSide::Left) { - for (auto & disjunct : or_func->arguments->children) - { - table_join.addDisjunct(); - CollectJoinOnKeysVisitor(data).visit(disjunct); - } - assert(table_join.getClauses().size() == or_func->arguments->children.size()); + auto input_exprs = get_input_expressions(left_header); + input_exprs.push_back(expr); + auto actions_dag = expressionsToActionsDAG(input_exprs, left_header); + table_join.getClauses().back().analyzer_left_filter_condition_column_name = actions_dag->getOutputs().back()->result_name; + QueryPlanStepPtr before_join_step = std::make_unique(left.getCurrentDataStream(), actions_dag); + before_join_step->setStepDescription("Before JOIN LEFT"); + steps.emplace_back(before_join_step.get()); + left.addStep(std::move(before_join_step)); } else { - table_join.addDisjunct(); - CollectJoinOnKeysVisitor(data).visit(ast); - assert(table_join.oneDisjunct()); - } - - if (join.has_post_join_filter()) - { - auto left_keys = table_join.leftKeysList(); - auto right_keys = table_join.rightKeysList(); - if (!left_keys->children.empty()) + /// Since the field references in expr are indexes into left_header ++ right_header, we use + /// mixed_header to build the actions_dag + auto input_exprs = get_input_expressions(mixed_header); + input_exprs.push_back(expr); + auto actions_dag = expressionsToActionsDAG(input_exprs, mixed_header); + + /// clear unused columns in actions_dag + for (const auto & col : left_header.getColumnsWithTypeAndName()) { - auto actions = astParser.convertToActions(left.getCurrentDataStream().header.getNamesAndTypesList(), left_keys); - QueryPlanStepPtr before_join_step = std::make_unique(left.getCurrentDataStream(), actions); - before_join_step->setStepDescription("Before JOIN LEFT"); - steps.emplace_back(before_join_step.get()); - left.addStep(std::move(before_join_step)); + 
actions_dag->removeUnusedResult(col.name); } + actions_dag->removeUnusedActions(); - if (!right_keys->children.empty()) - { - auto actions = astParser.convertToActions(right.getCurrentDataStream().header.getNamesAndTypesList(), right_keys); - QueryPlanStepPtr before_join_step = std::make_unique(right.getCurrentDataStream(), actions); - before_join_step->setStepDescription("Before JOIN RIGHT"); - steps.emplace_back(before_join_step.get()); - right.addStep(std::move(before_join_step)); - } + table_join.getClauses().back().analyzer_right_filter_condition_column_name = actions_dag->getOutputs().back()->result_name; + QueryPlanStepPtr before_join_step = std::make_unique(right.getCurrentDataStream(), actions_dag); + before_join_step->setStepDescription("Before JOIN RIGHT"); + steps.emplace_back(before_join_step.get()); + right.addStep(std::move(before_join_step)); } } - // if ch does not support the join type or join conditions, it will throw an exception like 'not support'. - catch (Poco::Exception & e) + else if (table_sides.size() == 2) { - // CH not support join condition has 'or' and has different table in each side. - // But in inner join, we could execute join condition after join. so we have add filter step - if (e.code() == ErrorCodes::INVALID_JOIN_ON_EXPRESSION && table_join.kind() == DB::JoinKind::Inner) - { - return true; - } - else - { - throw; - } + if (!allow_mixed_condition) + return false; + auto mixed_join_expressions_actions = expressionsToActionsDAG({expr}, mixed_header); + table_join.getMixedJoinExpression() + = std::make_shared(mixed_join_expressions_actions, ExpressionActionsSettings::fromContext(context)); + } + else + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Not any table column is used in the join condition.\n{}", join_rel.DebugString()); + } + return true; +} + +void JoinRelParser::addPostFilter(DB::QueryPlan & query_plan, const substrait::JoinRel & join) +{ + std::string filter_name; + auto actions_dag = std::make_shared(query_plan.getCurrentDataStream().header.getColumnsWithTypeAndName()); + if (!join.post_join_filter().has_scalar_function()) + { + // It may be singular_or_list + auto * in_node = getPlanParser()->parseExpression(actions_dag, join.post_join_filter()); + filter_name = in_node->result_name; + } + else + { + getPlanParser()->parseFunction(query_plan.getCurrentDataStream().header, join.post_join_filter(), filter_name, actions_dag, true); } - return false; + auto filter_step = std::make_unique(query_plan.getCurrentDataStream(), actions_dag, filter_name, true); + filter_step->setStepDescription("Post Join Filter"); + steps.emplace_back(filter_step.get()); + query_plan.addStep(std::move(filter_step)); } void registerJoinRelParser(RelParserFactory & factory) diff --git a/cpp-ch/local-engine/Parser/JoinRelParser.h b/cpp-ch/local-engine/Parser/JoinRelParser.h index 445b7e6833006..c423f43908e70 100644 --- a/cpp-ch/local-engine/Parser/JoinRelParser.h +++ b/cpp-ch/local-engine/Parser/JoinRelParser.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -28,6 +29,8 @@ class TableJoin; namespace local_engine { +class StorageJoinFromReadBuffer; + std::pair getJoinKindAndStrictness(substrait::JoinRel_JoinType join_type); class JoinRelParser : public RelParser @@ -50,15 +53,22 @@ class JoinRelParser : public RelParser DB::QueryPlanPtr parseJoin(const substrait::JoinRel & join, DB::QueryPlanPtr left, DB::QueryPlanPtr right); + void renamePlanColumns(DB::QueryPlan & left, DB::QueryPlan & right, const StorageJoinFromReadBuffer & 
storage_join); void addConvertStep(TableJoin & table_join, DB::QueryPlan & left, DB::QueryPlan & right); - bool tryAddPushDownFilter( - TableJoin & table_join, - const substrait::JoinRel & join, - DB::QueryPlan & left, - DB::QueryPlan & right, - const NamesAndTypesList & alias_right, - const Names & names); + void collectJoinKeys( + TableJoin & table_join, const substrait::JoinRel & join_rel, const DB::Block & left_header, const DB::Block & right_header); + + bool applyJoinFilter( + DB::TableJoin & table_join, + const substrait::JoinRel & join_rel, + DB::QueryPlan & left_plan, + DB::QueryPlan & right_plan, + bool allow_mixed_condition); + void addPostFilter(DB::QueryPlan & plan, const substrait::JoinRel & join); + + static std::unordered_set extractTableSidesFromExpression( + const substrait::Expression & expr, const DB::Block & left_header, const DB::Block & right_header); }; } diff --git a/cpp-ch/local-engine/Parser/MergeTreeRelParser.cpp b/cpp-ch/local-engine/Parser/MergeTreeRelParser.cpp index c89632919e29b..b51b76b974152 100644 --- a/cpp-ch/local-engine/Parser/MergeTreeRelParser.cpp +++ b/cpp-ch/local-engine/Parser/MergeTreeRelParser.cpp @@ -66,21 +66,18 @@ MergeTreeTable MergeTreeRelParser::parseMergeTreeTable(const substrait::ReadRel: return parseMergeTreeTableString(table.value()); } -CustomStorageMergeTreePtr MergeTreeRelParser::parseStorage(const MergeTreeTable & merge_tree_table, ContextMutablePtr context, UUID uuid) +CustomStorageMergeTreePtr +MergeTreeRelParser::parseStorage(const MergeTreeTable & merge_tree_table, ContextMutablePtr context, bool restore) { DB::Block header = TypeParser::buildBlockFromNamedStruct(merge_tree_table.schema, merge_tree_table.low_card_key); auto names_and_types_list = header.getNamesAndTypesList(); - auto storage_factory = StorageMergeTreeFactory::instance(); auto metadata = buildMetaData(names_and_types_list, context, merge_tree_table); - { - // use instance global table (without uuid) to restore metadata folder on current instance - // we need its lock - - auto global_storage = storage_factory.getStorage( + // use instance global table (without uuid) to restore metadata folder on current instance + // we need its lock + auto global_storage = StorageMergeTreeFactory::getStorage( StorageID(merge_tree_table.database, merge_tree_table.table), merge_tree_table.snapshot_id, - metadata->getColumns(), [&]() -> CustomStorageMergeTreePtr { auto custom_storage_merge_tree = std::make_shared( @@ -95,42 +92,47 @@ CustomStorageMergeTreePtr MergeTreeRelParser::parseStorage(const MergeTreeTable return custom_storage_merge_tree; }); + if (restore) restoreMetaData(global_storage, merge_tree_table, *context); - } - // return local table (with a uuid) for isolation - auto storage = storage_factory.getStorage( - StorageID(merge_tree_table.database, merge_tree_table.table, uuid), - merge_tree_table.snapshot_id, - metadata->getColumns(), - [&]() -> CustomStorageMergeTreePtr - { - auto custom_storage_merge_tree = std::make_shared( - StorageID(merge_tree_table.database, merge_tree_table.table, uuid), - merge_tree_table.relative_path, - *metadata, - false, - context, - "", - MergeTreeData::MergingParams(), - buildMergeTreeSettings(merge_tree_table.table_configs)); - return custom_storage_merge_tree; - }); - return storage; + return global_storage; } CustomStorageMergeTreePtr -MergeTreeRelParser::parseStorage(const substrait::ReadRel::ExtensionTable & extension_table, ContextMutablePtr context, UUID uuid) +MergeTreeRelParser::parseStorage(const 
substrait::ReadRel::ExtensionTable & extension_table, ContextMutablePtr context) { auto merge_tree_table = parseMergeTreeTable(extension_table); - return parseStorage(merge_tree_table, context, uuid); + return parseStorage(merge_tree_table, context, true); +} + +CustomStorageMergeTreePtr +MergeTreeRelParser::copyToDefaultPolicyStorage(MergeTreeTable merge_tree_table, ContextMutablePtr context) +{ + auto temp_uuid = UUIDHelpers::generateV4(); + String temp_uuid_str = toString(temp_uuid); + merge_tree_table.table = merge_tree_table.table + "_" + temp_uuid_str; + merge_tree_table.snapshot_id = ""; + merge_tree_table.table_configs.storage_policy = ""; + merge_tree_table.relative_path = merge_tree_table.relative_path + "_" + temp_uuid_str; + return parseStorage(merge_tree_table, context); +} + +CustomStorageMergeTreePtr +MergeTreeRelParser::copyToVirtualStorage(MergeTreeTable merge_tree_table, ContextMutablePtr context) +{ + auto temp_uuid = UUIDHelpers::generateV4(); + String temp_uuid_str = toString(temp_uuid); + merge_tree_table.table = merge_tree_table.table + "_" + temp_uuid_str; + merge_tree_table.snapshot_id = ""; + return parseStorage(merge_tree_table, context); } DB::QueryPlanPtr MergeTreeRelParser::parseReadRel( DB::QueryPlanPtr query_plan, const substrait::ReadRel & rel, const substrait::ReadRel::ExtensionTable & extension_table) { auto merge_tree_table = parseMergeTreeTable(extension_table); - DB::Block header = TypeParser::buildBlockFromNamedStruct(merge_tree_table.schema, merge_tree_table.low_card_key); + auto storage = parseStorage(extension_table, global_context); + DB::Block input; if (rel.has_base_schema() && rel.base_schema().names_size()) { @@ -139,35 +141,15 @@ DB::QueryPlanPtr MergeTreeRelParser::parseReadRel( else { NamesAndTypesList one_column_name_type; - one_column_name_type.push_back(header.getNamesAndTypesList().front()); + one_column_name_type.push_back(storage->getInMemoryMetadataPtr()->getColumns().getAll().front()); input = BlockUtil::buildHeader(one_column_name_type); - LOG_DEBUG(&Poco::Logger::get("SerializedPlanParser"), "Try to read ({}) instead of empty header", header.dumpNames()); + LOG_DEBUG( + &Poco::Logger::get("SerializedPlanParser"), "Try to read ({}) instead of empty header", one_column_name_type.front().dump()); } - auto storage_factory = StorageMergeTreeFactory::instance(); - auto metadata = buildMetaData(header.getNamesAndTypesList(), context, merge_tree_table); - StorageID table_id(merge_tree_table.database, merge_tree_table.table); - auto storage = storage_factory.getStorage( - table_id, - merge_tree_table.snapshot_id, - metadata->getColumns(), - [&]() -> CustomStorageMergeTreePtr - { - auto custom_storage_merge_tree = std::make_shared( - StorageID(merge_tree_table.database, merge_tree_table.table), - merge_tree_table.relative_path, - *metadata, - false, - global_context, - "", - MergeTreeData::MergingParams(), - buildMergeTreeSettings(merge_tree_table.table_configs)); - return custom_storage_merge_tree; - }); - restoreMetaData(storage, merge_tree_table, *context); for (const auto & [name, sizes] : storage->getColumnSizes()) column_sizes[name] = sizes.data_compressed; - auto storage_snapshot = std::make_shared(*storage, metadata); + auto storage_snapshot = std::make_shared(*storage, storage->getInMemoryMetadataPtr()); auto names_and_types_list = input.getNamesAndTypesList(); auto query_info = buildQueryInfo(names_and_types_list); @@ -180,9 +162,9 @@ DB::QueryPlanPtr MergeTreeRelParser::parseReadRel( query_info->prewhere_info = 
parsePreWhereInfo(rel.filter(), input); } - std::vector selected_parts = storage_factory.getDataParts(table_id, merge_tree_table.snapshot_id, merge_tree_table.getPartNames()); - if (selected_parts.empty()) - throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "no data part found."); + std::vector selected_parts + = StorageMergeTreeFactory::getDataPartsByNames(storage->getStorageID(), merge_tree_table.snapshot_id, merge_tree_table.getPartNames()); + auto read_step = storage->reader.readFromParts( selected_parts, /* alter_conversions = */ @@ -195,8 +177,7 @@ DB::QueryPlanPtr MergeTreeRelParser::parseReadRel( 1); auto * source_step_with_filter = static_cast(read_step.get()); - const auto & storage_prewhere_info = query_info->prewhere_info; - if (storage_prewhere_info) + if (const auto & storage_prewhere_info = query_info->prewhere_info) { source_step_with_filter->addFilter(storage_prewhere_info->prewhere_actions, storage_prewhere_info->prewhere_column_name); source_step_with_filter->applyFilters(); @@ -230,7 +211,7 @@ PrewhereInfoPtr MergeTreeRelParser::parsePreWhereInfo(const substrait::Expressio prewhere_info->prewhere_column_name = filter_name; prewhere_info->need_filter = true; prewhere_info->remove_prewhere_column = true; - prewhere_info->prewhere_actions->projectInput(false); + for (const auto & name : input.getNames()) prewhere_info->prewhere_actions->tryRestoreColumn(name); return prewhere_info; @@ -417,7 +398,7 @@ String MergeTreeRelParser::filterRangesOnDriver(const substrait::ReadRel & read_ google::protobuf::StringValue table; table.ParseFromString(read_rel.advanced_extension().enhancement().value()); auto merge_tree_table = parseMergeTreeTableString(table.value()); - auto custom_storage_mergetree = parseStorage(merge_tree_table, global_context); + auto custom_storage_mergetree = parseStorage(merge_tree_table, global_context, true); auto input = TypeParser::buildBlockFromNamedStruct(read_rel.base_schema()); auto names_and_types_list = input.getNamesAndTypesList(); @@ -427,7 +408,7 @@ String MergeTreeRelParser::filterRangesOnDriver(const substrait::ReadRel & read_ auto storage_factory = StorageMergeTreeFactory::instance(); std::vector selected_parts - = storage_factory.getDataParts(StorageID(merge_tree_table.database, merge_tree_table.table), merge_tree_table.snapshot_id, merge_tree_table.getPartNames()); + = storage_factory.getDataPartsByNames(StorageID(merge_tree_table.database, merge_tree_table.table), merge_tree_table.snapshot_id, merge_tree_table.getPartNames()); auto storage_snapshot = std::make_shared(*custom_storage_mergetree, custom_storage_mergetree->getInMemoryMetadataPtr()); if (selected_parts.empty()) diff --git a/cpp-ch/local-engine/Parser/MergeTreeRelParser.h b/cpp-ch/local-engine/Parser/MergeTreeRelParser.h index 7619851d93b64..bf27b184f9876 100644 --- a/cpp-ch/local-engine/Parser/MergeTreeRelParser.h +++ b/cpp-ch/local-engine/Parser/MergeTreeRelParser.h @@ -41,9 +41,18 @@ class MergeTreeRelParser : public RelParser { public: static CustomStorageMergeTreePtr - parseStorage(const substrait::ReadRel::ExtensionTable & extension_table, ContextMutablePtr context, UUID uuid = UUIDHelpers::Nil); + parseStorage(const substrait::ReadRel::ExtensionTable & extension_table, ContextMutablePtr context); + static CustomStorageMergeTreePtr parseStorage( + const MergeTreeTable & merge_tree_table, ContextMutablePtr context, bool restore = false); + + // Create random table name and table path and use default storage policy. 
+ // In insert case, mergetree data can be uploaded after merges in default storage (Local Disk). + static CustomStorageMergeTreePtr + copyToDefaultPolicyStorage(MergeTreeTable merge_tree_table, ContextMutablePtr context); + + // Use the same table path and data path as the original table. static CustomStorageMergeTreePtr - parseStorage(const MergeTreeTable & merge_tree_table, ContextMutablePtr context, UUID uuid = UUIDHelpers::Nil); + copyToVirtualStorage(MergeTreeTable merge_tree_table, ContextMutablePtr context); static MergeTreeTable parseMergeTreeTable(const substrait::ReadRel::ExtensionTable & extension_table); diff --git a/cpp-ch/local-engine/Parser/ProjectRelParser.cpp b/cpp-ch/local-engine/Parser/ProjectRelParser.cpp index caf779ac13bc0..2f75ac396dfe3 100644 --- a/cpp-ch/local-engine/Parser/ProjectRelParser.cpp +++ b/cpp-ch/local-engine/Parser/ProjectRelParser.cpp @@ -21,6 +21,9 @@ #include #include #include +#include + +using namespace DB; namespace local_engine { @@ -99,7 +102,6 @@ ProjectRelParser::SplittedActionsDAGs ProjectRelParser::splitActionsDAGInGenerat std::unordered_set first_split_nodes(array_join_node->children.begin(), array_join_node->children.end()); auto first_split_result = actions_dag->split(first_split_nodes); res.before_array_join = first_split_result.first; - res.before_array_join->projectInput(true); array_join_node = findArrayJoinNode(first_split_result.second); std::unordered_set second_split_nodes = {array_join_node}; @@ -110,15 +112,46 @@ ProjectRelParser::SplittedActionsDAGs ProjectRelParser::splitActionsDAGInGenerat return res; } +bool ProjectRelParser::isReplicateRows(substrait::GenerateRel rel) +{ + return plan_parser->isFunction(rel.generator().scalar_function(), "replicaterows"); +} + +DB::QueryPlanPtr ProjectRelParser::parseReplicateRows(DB::QueryPlanPtr query_plan, substrait::GenerateRel generate_rel) +{ + std::vector expressions; + for (int i = 0; i < generate_rel.generator().scalar_function().arguments_size(); ++i) + { + expressions.emplace_back(generate_rel.generator().scalar_function().arguments(i).value()); + } + auto header = query_plan->getCurrentDataStream().header; + auto actions_dag = expressionsToActionsDAG(expressions, header); + auto before_replicate_rows = std::make_unique(query_plan->getCurrentDataStream(), actions_dag); + before_replicate_rows->setStepDescription("Before ReplicateRows"); + steps.emplace_back(before_replicate_rows.get()); + query_plan->addStep(std::move(before_replicate_rows)); + + auto replicate_rows_step = std::make_unique(query_plan->getCurrentDataStream()); + replicate_rows_step->setStepDescription("ReplicateRows"); + steps.emplace_back(replicate_rows_step.get()); + query_plan->addStep(std::move(replicate_rows_step)); + return query_plan; +} + DB::QueryPlanPtr ProjectRelParser::parseGenerate(DB::QueryPlanPtr query_plan, const substrait::Rel & rel, std::list & /*rel_stack_*/) { const auto & generate_rel = rel.generate(); + if (isReplicateRows(generate_rel)) + { + return parseReplicateRows(std::move(query_plan), generate_rel); + } std::vector expressions; for (int i = 0; i < generate_rel.child_output_size(); ++i) { expressions.emplace_back(generate_rel.child_output(i)); } + expressions.emplace_back(generate_rel.generator()); auto header = query_plan->getCurrentDataStream().header; auto actions_dag = expressionsToActionsDAG(expressions, header); diff --git a/cpp-ch/local-engine/Parser/ProjectRelParser.h b/cpp-ch/local-engine/Parser/ProjectRelParser.h index ae56939144758..48a16d774d887 100644 ---
a/cpp-ch/local-engine/Parser/ProjectRelParser.h +++ b/cpp-ch/local-engine/Parser/ProjectRelParser.h @@ -19,7 +19,6 @@ #include #include #include -#include namespace local_engine { @@ -50,6 +49,9 @@ class ProjectRelParser : public RelParser /// Split actions_dag of generate rel into 3 parts: before array join + during array join + after array join static SplittedActionsDAGs splitActionsDAGInGenerate(ActionsDAGPtr actions_dag); + bool isReplicateRows(substrait::GenerateRel rel); + + DB::QueryPlanPtr parseReplicateRows(QueryPlanPtr query_plan, substrait::GenerateRel generate_rel); const substrait::Rel & getSingleInput(const substrait::Rel & rel) override { diff --git a/cpp-ch/local-engine/Parser/RelMetric.cpp b/cpp-ch/local-engine/Parser/RelMetric.cpp index eec31213a69ef..feb930dfc4ed4 100644 --- a/cpp-ch/local-engine/Parser/RelMetric.cpp +++ b/cpp-ch/local-engine/Parser/RelMetric.cpp @@ -142,7 +142,7 @@ const String & RelMetric::getName() const return name; } -std::string RelMetricSerializer::serializeRelMetric(RelMetricPtr rel_metric, bool flatten) +std::string RelMetricSerializer::serializeRelMetric(const RelMetricPtr & rel_metric, bool flatten) { StringBuffer result; Writer writer(result); diff --git a/cpp-ch/local-engine/Parser/RelMetric.h b/cpp-ch/local-engine/Parser/RelMetric.h index 8255654a8dde5..8706bed2ff022 100644 --- a/cpp-ch/local-engine/Parser/RelMetric.h +++ b/cpp-ch/local-engine/Parser/RelMetric.h @@ -58,6 +58,6 @@ class RelMetric class RelMetricSerializer { public: - static std::string serializeRelMetric(RelMetricPtr rel_metric, bool flatten = true); + static std::string serializeRelMetric(const RelMetricPtr & rel_metric, bool flatten = true); }; } diff --git a/cpp-ch/local-engine/Parser/RelParser.cpp b/cpp-ch/local-engine/Parser/RelParser.cpp index 7fc8078271093..282339c4d641f 100644 --- a/cpp-ch/local-engine/Parser/RelParser.cpp +++ b/cpp-ch/local-engine/Parser/RelParser.cpp @@ -15,12 +15,16 @@ * limitations under the License. 
*/ #include "RelParser.h" + #include +#include + #include +#include #include -#include -#include #include +#include + namespace DB { @@ -38,7 +42,20 @@ AggregateFunctionPtr RelParser::getAggregateFunction( { auto & factory = AggregateFunctionFactory::instance(); auto action = NullsAction::EMPTY; - return factory.get(name, action, arg_types, parameters, properties); + + String function_name = name; + if (name == "avg" && isDecimal(removeNullable(arg_types[0]))) + function_name = "sparkAvg"; + else if (name == "avgPartialMerge") + { + if (auto agg_func = typeid_cast(arg_types[0].get()); + !agg_func->getArgumentsDataTypes().empty() && isDecimal(removeNullable(agg_func->getArgumentsDataTypes()[0]))) + { + function_name = "sparkAvgPartialMerge"; + } + } + + return factory.get(function_name, action, arg_types, parameters, properties); } std::optional RelParser::parseSignatureFunctionName(UInt32 function_ref) diff --git a/cpp-ch/local-engine/Parser/RelParser.h b/cpp-ch/local-engine/Parser/RelParser.h index 6ca8af5359551..0228c2867a269 100644 --- a/cpp-ch/local-engine/Parser/RelParser.h +++ b/cpp-ch/local-engine/Parser/RelParser.h @@ -85,7 +85,6 @@ class RelParser static std::map parseFormattedRelAdvancedOptimization(const substrait::extensions::AdvancedExtension &advanced_extension); static std::string getStringConfig(const std::map & configs, const std::string & key, const std::string & default_value = ""); -private: SerializedPlanParser * plan_parser; }; diff --git a/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp b/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp index a26f78699dc88..8c60c6e500a97 100644 --- a/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp +++ b/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp @@ -87,14 +87,14 @@ namespace DB { namespace ErrorCodes { - extern const int LOGICAL_ERROR; - extern const int UNKNOWN_TYPE; - extern const int BAD_ARGUMENTS; - extern const int NO_SUCH_DATA_PART; - extern const int UNKNOWN_FUNCTION; - extern const int CANNOT_PARSE_PROTOBUF_SCHEMA; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int INVALID_JOIN_ON_EXPRESSION; +extern const int LOGICAL_ERROR; +extern const int UNKNOWN_TYPE; +extern const int BAD_ARGUMENTS; +extern const int NO_SUCH_DATA_PART; +extern const int UNKNOWN_FUNCTION; +extern const int CANNOT_PARSE_PROTOBUF_SCHEMA; +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int INVALID_JOIN_ON_EXPRESSION; } } @@ -144,16 +144,13 @@ void SerializedPlanParser::parseExtensions( if (extension.has_extension_function()) { function_mapping.emplace( - std::to_string(extension.extension_function().function_anchor()), - extension.extension_function().name()); + std::to_string(extension.extension_function().function_anchor()), extension.extension_function().name()); } } } std::shared_ptr SerializedPlanParser::expressionsToActionsDAG( - const std::vector & expressions, - const Block & header, - const Block & read_schema) + const std::vector & expressions, const Block & header, const Block & read_schema) { auto actions_dag = std::make_shared(blockToNameAndTypeList(header)); NamesWithAliases required_columns; @@ -214,7 +211,7 @@ std::shared_ptr SerializedPlanParser::expressionsToActionsDAG( } } } - else if (expr.has_cast() || expr.has_if_then() || expr.has_literal()) + else if (expr.has_cast() || expr.has_if_then() || expr.has_literal() || expr.has_singular_or_list()) { const auto * node = parseExpression(actions_dag, expr); actions_dag->addOrReplaceInOutputs(*node); @@ -234,6 +231,7 @@ std::shared_ptr 
SerializedPlanParser::expressionsToActionsDAG( throw Exception(ErrorCodes::BAD_ARGUMENTS, "unsupported projection type {}.", magic_enum::enum_name(expr.rex_type_case())); } actions_dag->project(required_columns); + actions_dag->appendInputsForUnusedColumns(header); return actions_dag; } @@ -258,8 +256,8 @@ std::string getDecimalFunction(const substrait::Type_Decimal & decimal, bool nul bool SerializedPlanParser::isReadRelFromJava(const substrait::ReadRel & rel) { - return rel.has_local_files() && rel.local_files().items().size() == 1 && rel.local_files().items().at(0).uri_file().starts_with( - "iterator"); + return rel.has_local_files() && rel.local_files().items().size() == 1 + && rel.local_files().items().at(0).uri_file().starts_with("iterator"); } bool SerializedPlanParser::isReadFromMergeTree(const substrait::ReadRel & rel) @@ -335,6 +333,19 @@ IQueryPlanStep * SerializedPlanParser::addRemoveNullableStep(QueryPlan & plan, c return step_ptr; } +IQueryPlanStep * SerializedPlanParser::addRollbackFilterHeaderStep(QueryPlanPtr & query_plan, const Block & input_header) +{ + auto convert_actions_dag = ActionsDAG::makeConvertingActions( + query_plan->getCurrentDataStream().header.getColumnsWithTypeAndName(), + input_header.getColumnsWithTypeAndName(), + ActionsDAG::MatchColumnsMode::Name); + auto expression_step = std::make_unique(query_plan->getCurrentDataStream(), convert_actions_dag); + expression_step->setStepDescription("Generator for rollback filter"); + auto * step_ptr = expression_step.get(); + query_plan->addStep(std::move(expression_step)); + return step_ptr; +} + DataTypePtr wrapNullableType(substrait::Type_Nullability nullable, DataTypePtr nested_type) { return wrapNullableType(nullable == substrait::Type_Nullability_NULLABILITY_NULLABLE, nested_type); @@ -366,13 +377,13 @@ DataTypePtr wrapNullableType(bool nullable, DataTypePtr nested_type) return nested_type; } -QueryPlanPtr SerializedPlanParser::parse(std::unique_ptr plan) +QueryPlanPtr SerializedPlanParser::parse(const substrait::Plan & plan) { - logDebugMessage(*plan, "substrait plan"); - parseExtensions(plan->extensions()); - if (plan->relations_size() == 1) + logDebugMessage(plan, "substrait plan"); + parseExtensions(plan.extensions()); + if (plan.relations_size() == 1) { - auto root_rel = plan->relations().at(0); + auto root_rel = plan.relations().at(0); if (!root_rel.has_root()) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "must have root rel!"); @@ -553,6 +564,16 @@ NamesAndTypesList SerializedPlanParser::blockToNameAndTypeList(const Block & hea return types; } +std::optional SerializedPlanParser::getFunctionSignatureName(UInt32 function_ref) const +{ + auto it = function_mapping.find(std::to_string(function_ref)); + if (it == function_mapping.end()) + return {}; + auto function_signature = it->second; + auto pos = function_signature.find(':'); + return function_signature.substr(0, pos); +} + std::string SerializedPlanParser::getFunctionName(const std::string & function_signature, const substrait::Expression_ScalarFunction & function) { @@ -573,9 +594,7 @@ SerializedPlanParser::getFunctionName(const std::string & function_signature, co { if (args.size() != 2) throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Spark function extract requires two args, function:{}", - function.ShortDebugString()); + ErrorCodes::BAD_ARGUMENTS, "Spark function extract requires two args, function:{}", function.ShortDebugString()); // Get the first arg: field const auto & extract_field = args.at(0); @@ -639,15 +658,6 @@ 
SerializedPlanParser::getFunctionName(const std::string & function_signature, co if (null_on_overflow) ch_function_name = ch_function_name + "OrNull"; } - else if (function_name == "char_length") - { - /// In Spark - /// char_length returns the number of bytes when input is binary type, corresponding to CH length function - /// char_length returns the number of characters when input is string type, corresponding to CH char_length function - ch_function_name = SCALAR_FUNCTIONS.at(function_name); - if (function_signature.find("vbin") != std::string::npos) - ch_function_name = "length"; - } else if (function_name == "reverse") { if (function.output_type().has_list()) @@ -655,19 +665,6 @@ SerializedPlanParser::getFunctionName(const std::string & function_signature, co else ch_function_name = "reverseUTF8"; } - else if (function_name == "concat") - { - /// 1. ConcatOverloadResolver cannot build arrayConcat for Nullable(Array) type which causes failures when using functions like concat(split()). - /// So we use arrayConcat directly if the output type is array. - /// 2. CH ConcatImpl can only accept at least 2 arguments, but Spark concat can accept 1 argument, like concat('a') - /// in such case we use identity function - if (function.output_type().has_list()) - ch_function_name = "arrayConcat"; - else if (args.size() == 1) - ch_function_name = "identity"; - else - ch_function_name = "concat"; - } else ch_function_name = SCALAR_FUNCTIONS.at(function_name); @@ -691,9 +688,7 @@ void SerializedPlanParser::parseArrayJoinArguments( /// The argument number of arrayJoin(converted from Spark explode/posexplode) should be 1 if (scalar_function.arguments_size() != 1) throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Argument number of arrayJoin should be 1 instead of {}", - scalar_function.arguments_size()); + ErrorCodes::BAD_ARGUMENTS, "Argument number of arrayJoin should be 1 instead of {}", scalar_function.arguments_size()); auto function_name_copy = function_name; parseFunctionArguments(actions_dag, parsed_args, function_name_copy, scalar_function); @@ -732,11 +727,7 @@ void SerializedPlanParser::parseArrayJoinArguments( } ActionsDAG::NodeRawConstPtrs SerializedPlanParser::parseArrayJoinWithDAG( - const substrait::Expression & rel, - std::vector & result_names, - ActionsDAGPtr actions_dag, - bool keep_result, - bool position) + const substrait::Expression & rel, std::vector & result_names, ActionsDAGPtr actions_dag, bool keep_result, bool position) { if (!rel.has_scalar_function()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "The root of expression should be a scalar function:\n {}", rel.DebugString()); @@ -760,7 +751,8 @@ ActionsDAG::NodeRawConstPtrs SerializedPlanParser::parseArrayJoinWithDAG( auto tuple_element_builder = FunctionFactory::instance().get("sparkTupleElement", context); auto tuple_index_type = std::make_shared(); - auto add_tuple_element = [&](const ActionsDAG::Node * tuple_node, size_t i) -> const ActionsDAG::Node * { + auto add_tuple_element = [&](const ActionsDAG::Node * tuple_node, size_t i) -> const ActionsDAG::Node * + { ColumnWithTypeAndName index_col(tuple_index_type->createColumnConst(1, i), tuple_index_type, getUniqueName(std::to_string(i))); const auto * index_node = &actions_dag->addColumn(std::move(index_col)); auto result_name = "sparkTupleElement(" + tuple_node->result_name + ", " + index_node->result_name + ")"; @@ -852,10 +844,7 @@ ActionsDAG::NodeRawConstPtrs SerializedPlanParser::parseArrayJoinWithDAG( } const ActionsDAG::Node * 
SerializedPlanParser::parseFunctionWithDAG( - const substrait::Expression & rel, - std::string & result_name, - ActionsDAGPtr actions_dag, - bool keep_result) + const substrait::Expression & rel, std::string & result_name, ActionsDAGPtr actions_dag, bool keep_result) { if (!rel.has_scalar_function()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "the root of expression should be a scalar function:\n {}", rel.DebugString()); @@ -870,10 +859,7 @@ const ActionsDAG::Node * SerializedPlanParser::parseFunctionWithDAG( if (auto func_parser = FunctionParserFactory::instance().tryGet(func_name, this)) { LOG_DEBUG( - &Poco::Logger::get("SerializedPlanParser"), - "parse function {} by function parser: {}", - func_name, - func_parser->getName()); + &Poco::Logger::get("SerializedPlanParser"), "parse function {} by function parser: {}", func_name, func_parser->getName()); const auto * result_node = func_parser->parse(scalar_function, actions_dag); if (keep_result) actions_dag->addOrReplaceInOutputs(*result_node); @@ -942,12 +928,10 @@ const ActionsDAG::Node * SerializedPlanParser::parseFunctionWithDAG( UInt32 precision = rel.scalar_function().output_type().decimal().precision(); UInt32 scale = rel.scalar_function().output_type().decimal().scale(); auto uint32_type = std::make_shared(); - new_args.emplace_back( - &actions_dag->addColumn( - ColumnWithTypeAndName(uint32_type->createColumnConst(1, precision), uint32_type, getUniqueName(toString(precision))))); - new_args.emplace_back( - &actions_dag->addColumn( - ColumnWithTypeAndName(uint32_type->createColumnConst(1, scale), uint32_type, getUniqueName(toString(scale))))); + new_args.emplace_back(&actions_dag->addColumn( + ColumnWithTypeAndName(uint32_type->createColumnConst(1, precision), uint32_type, getUniqueName(toString(precision))))); + new_args.emplace_back(&actions_dag->addColumn( + ColumnWithTypeAndName(uint32_type->createColumnConst(1, scale), uint32_type, getUniqueName(toString(scale))))); args = std::move(new_args); } else if (startsWith(function_signature, "make_decimal:")) @@ -962,12 +946,10 @@ const ActionsDAG::Node * SerializedPlanParser::parseFunctionWithDAG( UInt32 precision = rel.scalar_function().output_type().decimal().precision(); UInt32 scale = rel.scalar_function().output_type().decimal().scale(); auto uint32_type = std::make_shared(); - new_args.emplace_back( - &actions_dag->addColumn( - ColumnWithTypeAndName(uint32_type->createColumnConst(1, precision), uint32_type, getUniqueName(toString(precision))))); - new_args.emplace_back( - &actions_dag->addColumn( - ColumnWithTypeAndName(uint32_type->createColumnConst(1, scale), uint32_type, getUniqueName(toString(scale))))); + new_args.emplace_back(&actions_dag->addColumn( + ColumnWithTypeAndName(uint32_type->createColumnConst(1, precision), uint32_type, getUniqueName(toString(precision))))); + new_args.emplace_back(&actions_dag->addColumn( + ColumnWithTypeAndName(uint32_type->createColumnConst(1, scale), uint32_type, getUniqueName(toString(scale))))); args = std::move(new_args); } @@ -985,9 +967,8 @@ const ActionsDAG::Node * SerializedPlanParser::parseFunctionWithDAG( actions_dag, function_node, // as stated in isTypeMatched, currently we don't change nullability of the result type - function_node->result_type->isNullable() - ? local_engine::wrapNullableType(true, result_type)->getName() - : local_engine::removeNullable(result_type)->getName(), + function_node->result_type->isNullable() ? 
local_engine::wrapNullableType(true, result_type)->getName() + : local_engine::removeNullable(result_type)->getName(), function_node->result_name, CastType::accurateOrNull); } @@ -997,16 +978,12 @@ const ActionsDAG::Node * SerializedPlanParser::parseFunctionWithDAG( actions_dag, function_node, // as stated in isTypeMatched, currently we don't change nullability of the result type - function_node->result_type->isNullable() - ? local_engine::wrapNullableType(true, result_type)->getName() - : local_engine::removeNullable(result_type)->getName(), + function_node->result_type->isNullable() ? local_engine::wrapNullableType(true, result_type)->getName() + : local_engine::removeNullable(result_type)->getName(), function_node->result_name); } } - if (ch_func_name == "JSON_VALUE") - result_node->function->setResolver(function_builder); - if (keep_result) actions_dag->addOrReplaceInOutputs(*result_node); @@ -1148,17 +1125,14 @@ void SerializedPlanParser::parseFunctionArgument( } const ActionsDAG::Node * SerializedPlanParser::parseFunctionArgument( - ActionsDAGPtr & actions_dag, - const std::string & function_name, - const substrait::FunctionArgument & arg) + ActionsDAGPtr & actions_dag, const std::string & function_name, const substrait::FunctionArgument & arg) { const ActionsDAG::Node * res; if (arg.value().has_scalar_function()) { std::string arg_name; bool keep_arg = FUNCTION_NEED_KEEP_ARGUMENTS.contains(function_name); - parseFunctionWithDAG(arg.value(), arg_name, actions_dag, keep_arg); - res = &actions_dag->getNodes().back(); + res = parseFunctionWithDAG(arg.value(), arg_name, actions_dag, keep_arg); } else { @@ -1178,11 +1152,8 @@ std::pair SerializedPlanParser::convertStructFieldType(const } auto type_id = type->getTypeId(); - if (type_id == TypeIndex::UInt8 || type_id == TypeIndex::UInt16 || type_id == TypeIndex::UInt32 - || type_id == TypeIndex::UInt64) - { + if (type_id == TypeIndex::UInt8 || type_id == TypeIndex::UInt16 || type_id == TypeIndex::UInt32 || type_id == TypeIndex::UInt64) return {type, field}; - } UINT_CONVERT(type, field, Int8) UINT_CONVERT(type, field, Int16) UINT_CONVERT(type, field, Int32) @@ -1191,12 +1162,14 @@ std::pair SerializedPlanParser::convertStructFieldType(const #undef UINT_CONVERT } +bool SerializedPlanParser::isFunction(substrait::Expression_ScalarFunction rel, String function_name) +{ + auto func_signature = function_mapping[std::to_string(rel.function_reference())]; + return func_signature.starts_with(function_name + ":"); +} + ActionsDAGPtr SerializedPlanParser::parseFunction( - const Block & header, - const substrait::Expression & rel, - std::string & result_name, - ActionsDAGPtr actions_dag, - bool keep_result) + const Block & header, const substrait::Expression & rel, std::string & result_name, ActionsDAGPtr actions_dag, bool keep_result) { if (!actions_dag) actions_dag = std::make_shared(blockToNameAndTypeList(header)); @@ -1206,11 +1179,7 @@ ActionsDAGPtr SerializedPlanParser::parseFunction( } ActionsDAGPtr SerializedPlanParser::parseFunctionOrExpression( - const Block & header, - const substrait::Expression & rel, - std::string & result_name, - ActionsDAGPtr actions_dag, - bool keep_result) + const Block & header, const substrait::Expression & rel, std::string & result_name, ActionsDAGPtr actions_dag, bool keep_result) { if (!actions_dag) actions_dag = std::make_shared(blockToNameAndTypeList(header)); @@ -1292,7 +1261,8 @@ ActionsDAGPtr SerializedPlanParser::parseJsonTuple( = &actions_dag->addFunction(json_extract_builder, {json_expr_node, 
extract_expr_node}, json_extract_result_name); auto tuple_element_builder = FunctionFactory::instance().get("sparkTupleElement", context); auto tuple_index_type = std::make_shared(); - auto add_tuple_element = [&](const ActionsDAG::Node * tuple_node, size_t i) -> const ActionsDAG::Node * { + auto add_tuple_element = [&](const ActionsDAG::Node * tuple_node, size_t i) -> const ActionsDAG::Node * + { ColumnWithTypeAndName index_col(tuple_index_type->createColumnConst(1, i), tuple_index_type, getUniqueName(std::to_string(i))); const auto * index_node = &actions_dag->addColumn(std::move(index_col)); auto result_name = "sparkTupleElement(" + tuple_node->result_name + ", " + index_node->result_name + ")"; @@ -1517,9 +1487,7 @@ std::pair SerializedPlanParser::parseLiteral(const substrait } default: { throw Exception( - ErrorCodes::UNKNOWN_TYPE, - "Unsupported spark literal type {}", - magic_enum::enum_name(literal.literal_type_case())); + ErrorCodes::UNKNOWN_TYPE, "Unsupported spark literal type {}", magic_enum::enum_name(literal.literal_type_case())); } } return std::make_pair(std::move(type), std::move(field)); @@ -1721,8 +1689,7 @@ substrait::ReadRel::ExtensionTable SerializedPlanParser::parseExtensionTable(con { substrait::ReadRel::ExtensionTable extension_table; google::protobuf::io::CodedInputStream coded_in( - reinterpret_cast(split_info.data()), - static_cast(split_info.size())); + reinterpret_cast(split_info.data()), static_cast(split_info.size())); coded_in.SetRecursionLimit(100000); auto ok = extension_table.ParseFromCodedStream(&coded_in); @@ -1736,8 +1703,7 @@ substrait::ReadRel::LocalFiles SerializedPlanParser::parseLocalFiles(const std:: { substrait::ReadRel::LocalFiles local_files; google::protobuf::io::CodedInputStream coded_in( - reinterpret_cast(split_info.data()), - static_cast(split_info.size())); + reinterpret_cast(split_info.data()), static_cast(split_info.size())); coded_in.SetRecursionLimit(100000); auto ok = local_files.ParseFromCodedStream(&coded_in); @@ -1747,10 +1713,44 @@ substrait::ReadRel::LocalFiles SerializedPlanParser::parseLocalFiles(const std:: return local_files; } +std::unique_ptr SerializedPlanParser::createExecutor(DB::QueryPlanPtr query_plan) +{ + Stopwatch stopwatch; + auto * logger = &Poco::Logger::get("SerializedPlanParser"); + const Settings & settings = context->getSettingsRef(); + + QueryPriorities priorities; + auto query_status = std::make_shared( + context, + "", + context->getClientInfo(), + priorities.insert(static_cast(settings.priority)), + CurrentThread::getGroup(), + IAST::QueryKind::Select, + settings, + 0); -QueryPlanPtr SerializedPlanParser::parse(const std::string & plan) + QueryPlanOptimizationSettings optimization_settings{.optimize_plan = settings.query_plan_enable_optimizations}; + auto pipeline_builder = query_plan->buildQueryPipeline( + optimization_settings, + BuildQueryPipelineSettings{ + .actions_settings + = ExpressionActionsSettings{.can_compile_expressions = true, .min_count_to_compile_expression = 3, .compile_expressions = CompileExpressions::yes}, + .process_list_element = query_status}); + QueryPipeline pipeline = QueryPipelineBuilder::getPipeline(std::move(*pipeline_builder)); + LOG_INFO(logger, "build pipeline {} ms", stopwatch.elapsedMicroseconds() / 1000.0); + + LOG_DEBUG( + logger, "clickhouse plan [optimization={}]:\n{}", settings.query_plan_enable_optimizations, PlanUtil::explainPlan(*query_plan)); + LOG_DEBUG(logger, "clickhouse pipeline:\n{}", QueryPipelineUtil::explainPipeline(pipeline)); + + return 
std::make_unique( + context, std::move(query_plan), std::move(pipeline), query_plan->getCurrentDataStream().header.cloneEmpty()); +} + +QueryPlanPtr SerializedPlanParser::parse(const std::string_view plan) { - auto plan_ptr = std::make_unique(); + substrait::Plan s_plan; /// https://stackoverflow.com/questions/52028583/getting-error-parsing-protobuf-data /// Parsing may fail when the number of recursive layers is large. /// Here, set a limit large enough to avoid this problem. @@ -1758,11 +1758,10 @@ QueryPlanPtr SerializedPlanParser::parse(const std::string & plan) google::protobuf::io::CodedInputStream coded_in(reinterpret_cast(plan.data()), static_cast(plan.size())); coded_in.SetRecursionLimit(100000); - auto ok = plan_ptr->ParseFromCodedStream(&coded_in); - if (!ok) + if (!s_plan.ParseFromCodedStream(&coded_in)) throw Exception(ErrorCodes::CANNOT_PARSE_PROTOBUF_SCHEMA, "Parse substrait::Plan from string failed"); - auto res = parse(std::move(plan_ptr)); + auto res = parse(s_plan); #ifndef NDEBUG PlanUtil::checkOuputType(*res); @@ -1772,22 +1771,21 @@ QueryPlanPtr SerializedPlanParser::parse(const std::string & plan) if (logger->debug()) { auto out = PlanUtil::explainPlan(*res); - LOG_ERROR(logger, "clickhouse plan:\n{}", out); + LOG_DEBUG(logger, "clickhouse plan:\n{}", out); } return res; } -QueryPlanPtr SerializedPlanParser::parseJson(const std::string & json_plan) +QueryPlanPtr SerializedPlanParser::parseJson(const std::string_view & json_plan) { - auto plan_ptr = std::make_unique(); - auto s = google::protobuf::util::JsonStringToMessage(absl::string_view(json_plan.c_str()), plan_ptr.get()); + substrait::Plan plan; + auto s = google::protobuf::util::JsonStringToMessage(json_plan, &plan); if (!s.ok()) throw Exception(ErrorCodes::CANNOT_PARSE_PROTOBUF_SCHEMA, "Parse substrait::Plan from json string failed: {}", s.ToString()); - return parse(std::move(plan_ptr)); + return parse(plan); } -SerializedPlanParser::SerializedPlanParser(const ContextPtr & context_) - : context(context_) +SerializedPlanParser::SerializedPlanParser(const ContextPtr & context_) : context(context_) { } @@ -1796,13 +1794,10 @@ ContextMutablePtr SerializedPlanParser::global_context = nullptr; Context::ConfigurationPtr SerializedPlanParser::config = nullptr; void SerializedPlanParser::collectJoinKeys( - const substrait::Expression & condition, - std::vector> & join_keys, - int32_t right_key_start) + const substrait::Expression & condition, std::vector> & join_keys, int32_t right_key_start) { auto condition_name = getFunctionName( - function_mapping.at(std::to_string(condition.scalar_function().function_reference())), - condition.scalar_function()); + function_mapping.at(std::to_string(condition.scalar_function().function_reference())), condition.scalar_function()); if (condition_name == "and") { collectJoinKeys(condition.scalar_function().arguments(0).value(), join_keys, right_key_start); @@ -1821,7 +1816,7 @@ void SerializedPlanParser::collectJoinKeys( } } -ActionsDAGPtr ASTParser::convertToActions(const NamesAndTypesList & name_and_types, const ASTPtr & ast) +ActionsDAG ASTParser::convertToActions(const NamesAndTypesList & name_and_types, const ASTPtr & ast) const { NamesAndTypesList aggregation_keys; ColumnNumbersList aggregation_keys_indexes_list; @@ -1830,9 +1825,9 @@ ActionsDAGPtr ASTParser::convertToActions(const NamesAndTypesList & name_and_typ ActionsMatcher::Data visitor_data( context, size_limits_for_set, - size_t(0), + static_cast(0), name_and_types, - std::make_shared(name_and_types), + 
ActionsDAG(name_and_types), std::make_shared(), false /* no_subqueries */, false /* no_makeset */, @@ -1852,8 +1847,8 @@ ASTPtr ASTParser::parseToAST(const Names & names, const substrait::Expression & auto substrait_name = function_signature.substr(0, function_signature.find(':')); auto func_parser = FunctionParserFactory::instance().tryGet(substrait_name, plan_parser); - String function_name = func_parser ? func_parser->getName() - : SerializedPlanParser::getFunctionName(function_signature, scalar_function); + String function_name + = func_parser ? func_parser->getName() : SerializedPlanParser::getFunctionName(function_signature, scalar_function); ASTs ast_args; parseFunctionArgumentsToAST(names, scalar_function, ast_args); @@ -1865,9 +1860,7 @@ ASTPtr ASTParser::parseToAST(const Names & names, const substrait::Expression & } void ASTParser::parseFunctionArgumentsToAST( - const Names & names, - const substrait::Expression_ScalarFunction & scalar_function, - ASTs & ast_args) + const Names & names, const substrait::Expression_ScalarFunction & scalar_function, ASTs & ast_args) { const auto & args = scalar_function.arguments(); @@ -1891,8 +1884,8 @@ ASTPtr ASTParser::parseArgumentToAST(const Names & names, const substrait::Expre case substrait::Expression::RexTypeCase::kLiteral: { DataTypePtr type; Field field; - std::tie(std::ignore, field) = SerializedPlanParser::parseLiteral(rel.literal()); - return std::make_shared(field); + std::tie(type, field) = SerializedPlanParser::parseLiteral(rel.literal()); + return std::make_shared(field, type); } case substrait::Expression::RexTypeCase::kSelection: { if (!rel.selection().has_direct_reference() || !rel.selection().direct_reference().has_struct_field()) @@ -2010,12 +2003,12 @@ ASTPtr ASTParser::parseArgumentToAST(const Names & names, const substrait::Expre } } -void SerializedPlanParser::removeNullableForRequiredColumns(const std::set & require_columns, ActionsDAGPtr actions_dag) +void SerializedPlanParser::removeNullableForRequiredColumns( + const std::set & require_columns, const ActionsDAGPtr & actions_dag) const { for (const auto & item : require_columns) { - const auto * require_node = actions_dag->tryFindInOutputs(item); - if (require_node) + if (const auto * require_node = actions_dag->tryFindInOutputs(item)) { auto function_builder = FunctionFactory::instance().get("assumeNotNull", context); ActionsDAG::NodeRawConstPtrs args = {require_node}; @@ -2026,9 +2019,7 @@ void SerializedPlanParser::removeNullableForRequiredColumns(const std::set & columns, - ActionsDAGPtr actions_dag, - std::map & nullable_measure_names) + const std::vector & columns, ActionsDAGPtr actions_dag, std::map & nullable_measure_names) { for (const auto & item : columns) { @@ -2046,6 +2037,7 @@ LocalExecutor::~LocalExecutor() { if (context->getConfigRef().getBool("dump_pipeline", false)) LOG_INFO(&Poco::Logger::get("LocalExecutor"), "Dump pipeline:\n{}", dumpPipeline()); + if (spark_buffer) { ch_column_to_spark_row->freeMem(spark_buffer->address, spark_buffer->size); @@ -2053,86 +2045,23 @@ LocalExecutor::~LocalExecutor() } } - -void LocalExecutor::execute(QueryPlanPtr query_plan) -{ - Stopwatch stopwatch; - - const Settings & settings = context->getSettingsRef(); - current_query_plan = std::move(query_plan); - auto * logger = &Poco::Logger::get("LocalExecutor"); - - QueryPriorities priorities; - auto query_status = std::make_shared( - context, - "", - context->getClientInfo(), - priorities.insert(static_cast(settings.priority)), - CurrentThread::getGroup(), - 
IAST::QueryKind::Select, - settings, - 0); - - QueryPlanOptimizationSettings optimization_settings{.optimize_plan = settings.query_plan_enable_optimizations}; - auto pipeline_builder = current_query_plan->buildQueryPipeline( - optimization_settings, - BuildQueryPipelineSettings{ - .actions_settings - = ExpressionActionsSettings{.can_compile_expressions = true, .min_count_to_compile_expression = 3, - .compile_expressions = CompileExpressions::yes}, - .process_list_element = query_status}); - - LOG_DEBUG(logger, "clickhouse plan after optimization:\n{}", PlanUtil::explainPlan(*current_query_plan)); - query_pipeline = QueryPipelineBuilder::getPipeline(std::move(*pipeline_builder)); - LOG_DEBUG(logger, "clickhouse pipeline:\n{}", QueryPipelineUtil::explainPipeline(query_pipeline)); - auto t_pipeline = stopwatch.elapsedMicroseconds(); - - executor = std::make_unique(query_pipeline); - auto t_executor = stopwatch.elapsedMicroseconds() - t_pipeline; - stopwatch.stop(); - LOG_INFO( - logger, - "build pipeline {} ms; create executor {} ms;", - t_pipeline / 1000.0, - t_executor / 1000.0); - - header = current_query_plan->getCurrentDataStream().header.cloneEmpty(); - ch_column_to_spark_row = std::make_unique(); -} - -std::unique_ptr LocalExecutor::writeBlockToSparkRow(Block & block) +std::unique_ptr LocalExecutor::writeBlockToSparkRow(const Block & block) const { return ch_column_to_spark_row->convertCHColumnToSparkRow(block); } bool LocalExecutor::hasNext() { - bool has_next; - try + size_t columns = currentBlock().columns(); + if (columns == 0 || isConsumed()) { - size_t columns = currentBlock().columns(); - if (columns == 0 || isConsumed()) - { - auto empty_block = header.cloneEmpty(); - setCurrentBlock(empty_block); - has_next = executor->pull(currentBlock()); - produce(); - } - else - { - has_next = true; - } - } - catch (Exception & e) - { - LOG_ERROR( - &Poco::Logger::get("LocalExecutor"), - "LocalExecutor run query plan failed with message: {}. 
Plan Explained: \n{}", - e.message(), - PlanUtil::explainPlan(*current_query_plan)); - throw; + auto empty_block = header.cloneEmpty(); + setCurrentBlock(empty_block); + bool has_next = executor->pull(currentBlock()); + produce(); + return has_next; } - return has_next; + return true; } SparkRowInfoPtr LocalExecutor::next() @@ -2169,17 +2098,28 @@ Block * LocalExecutor::nextColumnar() return columnar_batch; } +void LocalExecutor::cancel() +{ + if (executor) + executor->cancel(); +} + Block & LocalExecutor::getHeader() { return header; } -LocalExecutor::LocalExecutor(ContextPtr context_) - : context(context_) +LocalExecutor::LocalExecutor(const ContextPtr & context_, QueryPlanPtr query_plan, QueryPipeline && pipeline, const Block & header_) + : query_pipeline(std::move(pipeline)) + , executor(std::make_unique(query_pipeline)) + , header(header_) + , context(context_) + , ch_column_to_spark_row(std::make_unique()) + , current_query_plan(std::move(query_plan)) { } -std::string LocalExecutor::dumpPipeline() +std::string LocalExecutor::dumpPipeline() const { const auto & processors = query_pipeline.getProcessors(); for (auto & processor : processors) @@ -2203,12 +2143,8 @@ std::string LocalExecutor::dumpPipeline() } NonNullableColumnsResolver::NonNullableColumnsResolver( - const Block & header_, - SerializedPlanParser & parser_, - const substrait::Expression & cond_rel_) - : header(header_) - , parser(parser_) - , cond_rel(cond_rel_) + const Block & header_, SerializedPlanParser & parser_, const substrait::Expression & cond_rel_) + : header(header_), parser(parser_), cond_rel(cond_rel_) { } @@ -2280,8 +2216,7 @@ void NonNullableColumnsResolver::visitNonNullable(const substrait::Expression & } std::string NonNullableColumnsResolver::safeGetFunctionName( - const std::string & function_signature, - const substrait::Expression_ScalarFunction & function) + const std::string & function_signature, const substrait::Expression_ScalarFunction & function) const { try { diff --git a/cpp-ch/local-engine/Parser/SerializedPlanParser.h b/cpp-ch/local-engine/Parser/SerializedPlanParser.h index a636ebb9352f4..90086ea28649c 100644 --- a/cpp-ch/local-engine/Parser/SerializedPlanParser.h +++ b/cpp-ch/local-engine/Parser/SerializedPlanParser.h @@ -57,7 +57,7 @@ static const std::map SCALAR_FUNCTIONS {"get_timestamp", "parseDateTimeInJodaSyntaxOrNull"}, // for spark function: to_date/to_timestamp {"quarter", "toQuarter"}, {"to_unix_timestamp", "parseDateTimeInJodaSyntaxOrNull"}, - // {"unix_timestamp", "toUnixTimestamp"}, + //{"unix_timestamp", "toUnixTimestamp"}, {"date_format", "formatDateTimeInJodaSyntax"}, {"timestamp_add", "timestamp_add"}, @@ -94,6 +94,8 @@ static const std::map SCALAR_FUNCTIONS {"bitwise_and", "bitAnd"}, {"bitwise_or", "bitOr"}, {"bitwise_xor", "bitXor"}, + {"bit_get", "bitTest"}, + {"bit_count", "bitCount"}, {"sqrt", "sqrt"}, {"cbrt", "cbrt"}, {"degrees", "degrees"}, @@ -104,14 +106,15 @@ static const std::map SCALAR_FUNCTIONS {"hypot", "hypot"}, {"sign", "sign"}, {"radians", "radians"}, - {"greatest", "greatest"}, - {"least", "least"}, + {"greatest", "sparkGreatest"}, + {"least", "sparkLeast"}, {"shiftleft", "bitShiftLeft"}, {"shiftright", "bitShiftRight"}, {"check_overflow", "checkDecimalOverflowSpark"}, {"rand", "randCanonical"}, {"isnan", "isNaN"}, {"bin", "sparkBin"}, + {"rint", "sparkRint"}, /// string functions {"like", "like"}, @@ -126,15 +129,10 @@ static const std::map SCALAR_FUNCTIONS {"trim", ""}, // trimLeft or trimLeftSpark, depends on argument size {"ltrim", ""}, // trimRight 
or trimRightSpark, depends on argument size {"rtrim", ""}, // trimBoth or trimBothSpark, depends on argument size - {"concat", ""}, /// dummy mapping {"strpos", "positionUTF8"}, - {"char_length", - "char_length"}, /// Notice: when input argument is binary type, corresponding ch function is length instead of char_length {"replace", "replaceAll"}, {"regexp_replace", "replaceRegexpAll"}, - // {"regexp_extract", "regexpExtract"}, {"regexp_extract_all", "regexpExtractAllSpark"}, - {"chr", "char"}, {"rlike", "match"}, {"ascii", "ascii"}, {"split", "splitByRegexp"}, @@ -150,6 +148,7 @@ static const std::map SCALAR_FUNCTIONS {"initcap", "initcapUTF8"}, {"conv", "sparkConv"}, {"uuid", "generateUUIDv4"}, + {"levenshteinDistance", "editDistanceUTF8"}, /// hash functions {"crc32", "CRC32"}, @@ -178,6 +177,7 @@ static const std::map SCALAR_FUNCTIONS {"array", "array"}, {"shuffle", "arrayShuffle"}, {"range", "range"}, /// dummy mapping + {"flatten", "sparkArrayFlatten"}, // map functions {"map", "map"}, @@ -216,6 +216,7 @@ DataTypePtr wrapNullableType(bool nullable, DataTypePtr nested_type); std::string join(const ActionsDAG::NodeRawConstPtrs & v, char c); class SerializedPlanParser; +class LocalExecutor; // Give a condition expression `cond_rel_`, found all columns with nullability that must not containt // null after this filter. @@ -239,7 +240,7 @@ class NonNullableColumnsResolver void visit(const substrait::Expression & expr); void visitNonNullable(const substrait::Expression & expr); - String safeGetFunctionName(const String & function_signature, const substrait::Expression_ScalarFunction & function); + String safeGetFunctionName(const String & function_signature, const substrait::Expression_ScalarFunction & function) const; }; class SerializedPlanParser @@ -254,12 +255,23 @@ class SerializedPlanParser friend class NonNullableColumnsResolver; friend class JoinRelParser; friend class MergeTreeRelParser; + friend class ProjectRelParser; + + std::unique_ptr createExecutor(DB::QueryPlanPtr query_plan); + + DB::QueryPlanPtr parse(const std::string_view plan); + DB::QueryPlanPtr parse(const substrait::Plan & plan); public: explicit SerializedPlanParser(const ContextPtr & context); - DB::QueryPlanPtr parse(const std::string & plan); - DB::QueryPlanPtr parseJson(const std::string & json_plan); - DB::QueryPlanPtr parse(std::unique_ptr plan); + + /// UT only + DB::QueryPlanPtr parseJson(const std::string_view & json_plan); + std::unique_ptr createExecutor(const substrait::Plan & plan) { return createExecutor(parse((plan))); } + /// + + template + std::unique_ptr createExecutor(const std::string_view plan); DB::QueryPlanStepPtr parseReadRealWithLocalFile(const substrait::ReadRel & rel); DB::QueryPlanStepPtr parseReadRealWithJavaIter(const substrait::ReadRel & rel); @@ -276,7 +288,7 @@ class SerializedPlanParser materialize_inputs.emplace_back(materialize_input); } - void addSplitInfo(std::string & split_info) { split_infos.emplace_back(std::move(split_info)); } + void addSplitInfo(std::string && split_info) { split_infos.emplace_back(std::move(split_info)); } int nextSplitInfoIndex() { @@ -293,10 +305,15 @@ class SerializedPlanParser std::shared_ptr expressionsToActionsDAG( const std::vector & expressions, const DB::Block & header, const DB::Block & read_schema); RelMetricPtr getMetric() { return metrics.empty() ? 
nullptr : metrics.at(0); } + const std::unordered_map & getFunctionMapping() { return function_mapping; } static std::string getFunctionName(const std::string & function_sig, const substrait::Expression_ScalarFunction & function); + std::optional getFunctionSignatureName(UInt32 function_ref) const; IQueryPlanStep * addRemoveNullableStep(QueryPlan & plan, const std::set & columns); + IQueryPlanStep * addRollbackFilterHeaderStep(QueryPlanPtr & query_plan, const Block & input_header); + + static std::pair parseLiteral(const substrait::Expression_Literal & literal); static ContextMutablePtr global_context; static Context::ConfigurationPtr config; @@ -369,13 +386,14 @@ class SerializedPlanParser const ActionsDAG::Node * toFunctionNode(ActionsDAGPtr actions_dag, const String & function, const DB::ActionsDAG::NodeRawConstPtrs & args); // remove nullable after isNotNull - void removeNullableForRequiredColumns(const std::set & require_columns, ActionsDAGPtr actions_dag); + void removeNullableForRequiredColumns(const std::set & require_columns, const ActionsDAGPtr & actions_dag) const; std::string getUniqueName(const std::string & name) { return name + "_" + std::to_string(name_no++); } - static std::pair parseLiteral(const substrait::Expression_Literal & literal); void wrapNullable( const std::vector & columns, ActionsDAGPtr actions_dag, std::map & nullable_measure_names); static std::pair convertStructFieldType(const DB::DataTypePtr & type, const DB::Field & field); + bool isFunction(substrait::Expression_ScalarFunction rel, String function_name); + int name_no = 0; std::unordered_map function_mapping; std::vector input_iters; @@ -391,6 +409,12 @@ class SerializedPlanParser const ActionsDAG::Node * addColumn(DB::ActionsDAGPtr actions_dag, const DataTypePtr & type, const Field & field); }; +template +std::unique_ptr SerializedPlanParser::createExecutor(const std::string_view plan) +{ + return createExecutor(JsonPlan ? 
parseJson(plan) : parse(plan)); +} + struct SparkBuffer { char * address; @@ -400,35 +424,36 @@ struct SparkBuffer class LocalExecutor : public BlockIterator { public: - LocalExecutor() = default; - explicit LocalExecutor(ContextPtr context); - void execute(QueryPlanPtr query_plan); + LocalExecutor(const ContextPtr & context_, QueryPlanPtr query_plan, QueryPipeline && pipeline, const Block & header_); + ~LocalExecutor(); + SparkRowInfoPtr next(); Block * nextColumnar(); bool hasNext(); - ~LocalExecutor(); - Block & getHeader(); + /// Stop execution, used when task receives shutdown command or executor receives SIGTERM signal + void cancel(); + Block & getHeader(); RelMetricPtr getMetric() const { return metric; } void setMetric(RelMetricPtr metric_) { metric = metric_; } - void setExtraPlanHolder(std::vector & extra_plan_holder_) { extra_plan_holder = std::move(extra_plan_holder_); } private: - std::unique_ptr writeBlockToSparkRow(DB::Block & block); + std::unique_ptr writeBlockToSparkRow(const DB::Block & block) const; + + /// Dump processor runtime information to log + std::string dumpPipeline() const; + QueryPipeline query_pipeline; std::unique_ptr executor; Block header; ContextPtr context; std::unique_ptr ch_column_to_spark_row; std::unique_ptr spark_buffer; - DB::QueryPlanPtr current_query_plan; + QueryPlanPtr current_query_plan; RelMetricPtr metric; std::vector extra_plan_holder; - - /// Dump processor runtime information to log - std::string dumpPipeline(); }; @@ -444,7 +469,7 @@ class ASTParser ~ASTParser() = default; ASTPtr parseToAST(const Names & names, const substrait::Expression & rel); - ActionsDAGPtr convertToActions(const NamesAndTypesList & name_and_types, const ASTPtr & ast); + ActionsDAG convertToActions(const NamesAndTypesList & name_and_types, const ASTPtr & ast) const; private: ContextPtr context; diff --git a/cpp-ch/local-engine/Parser/SortRelParser.cpp b/cpp-ch/local-engine/Parser/SortRelParser.cpp index 88141d0306726..ea29e72d1324c 100644 --- a/cpp-ch/local-engine/Parser/SortRelParser.cpp +++ b/cpp-ch/local-engine/Parser/SortRelParser.cpp @@ -40,8 +40,15 @@ SortRelParser::parse(DB::QueryPlanPtr query_plan, const substrait::Rel & rel, st size_t limit = parseLimit(rel_stack_); const auto & sort_rel = rel.sort(); auto sort_descr = parseSortDescription(sort_rel.sorts(), query_plan->getCurrentDataStream().header); + SortingStep::Settings settings(*getContext()); + size_t offheap_per_task = getContext()->getConfigRef().getUInt64("off_heap_per_task"); + double spill_mem_ratio = getContext()->getConfigRef().getDouble("spill_mem_ratio", 0.9); + settings.worth_external_sort = [offheap_per_task, spill_mem_ratio]() -> bool + { + return CurrentMemoryTracker::current_memory() > offheap_per_task * spill_mem_ratio; + }; auto sorting_step = std::make_unique( - query_plan->getCurrentDataStream(), sort_descr, limit, SortingStep::Settings(*getContext()), false); + query_plan->getCurrentDataStream(), sort_descr, limit, settings, false); sorting_step->setStepDescription("Sorting step"); steps.emplace_back(sorting_step.get()); query_plan->addStep(std::move(sorting_step)); diff --git a/cpp-ch/local-engine/Parser/aggregate_function_parser/CommonAggregateFunctionParser.cpp b/cpp-ch/local-engine/Parser/aggregate_function_parser/CommonAggregateFunctionParser.cpp index afe65f7931a6b..1619c74106d1c 100644 --- a/cpp-ch/local-engine/Parser/aggregate_function_parser/CommonAggregateFunctionParser.cpp +++ b/cpp-ch/local-engine/Parser/aggregate_function_parser/CommonAggregateFunctionParser.cpp @@ 
-25,8 +25,6 @@ REGISTER_COMMON_AGGREGATE_FUNCTION_PARSER(Sum, sum, sum) REGISTER_COMMON_AGGREGATE_FUNCTION_PARSER(Avg, avg, avg) REGISTER_COMMON_AGGREGATE_FUNCTION_PARSER(Min, min, min) REGISTER_COMMON_AGGREGATE_FUNCTION_PARSER(Max, max, max) -REGISTER_COMMON_AGGREGATE_FUNCTION_PARSER(StdDev, stddev, stddev_samp) -REGISTER_COMMON_AGGREGATE_FUNCTION_PARSER(StdDevSamp, stddev_samp, stddev_samp) REGISTER_COMMON_AGGREGATE_FUNCTION_PARSER(StdDevPop, stddev_pop, stddev_pop) REGISTER_COMMON_AGGREGATE_FUNCTION_PARSER(BitAnd, bit_and, groupBitAnd) REGISTER_COMMON_AGGREGATE_FUNCTION_PARSER(BitOr, bit_or, groupBitOr) diff --git a/cpp-ch/local-engine/Parser/aggregate_function_parser/SimpleStatisticsFunctions.cpp b/cpp-ch/local-engine/Parser/aggregate_function_parser/SimpleStatisticsFunctions.cpp new file mode 100644 index 0000000000000..7e75e20bb7422 --- /dev/null +++ b/cpp-ch/local-engine/Parser/aggregate_function_parser/SimpleStatisticsFunctions.cpp @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include + +namespace local_engine +{ +/// For stddev +struct StddevNameStruct +{ + static constexpr auto spark_name = "stddev"; + static constexpr auto ch_name = "stddev"; +}; + +struct StddevSampNameStruct +{ + static constexpr auto spark_name = "stddev_samp"; + static constexpr auto ch_name = "stddev_samp"; +}; +template +class AggregateFunctionParserStddev final : public AggregateFunctionParser +{ +public: + AggregateFunctionParserStddev(SerializedPlanParser * plan_parser_) : AggregateFunctionParser(plan_parser_) { } + ~AggregateFunctionParserStddev() override = default; + String getName() const override { return NameStruct::spark_name; } + static constexpr auto name = NameStruct::spark_name; + String getCHFunctionName(const CommonFunctionInfo &) const override { return NameStruct::ch_name; } + String getCHFunctionName(DB::DataTypes &) const override { return NameStruct::ch_name; } + const DB::ActionsDAG::Node * convertNodeTypeIfNeeded( + const CommonFunctionInfo & func_info, + const DB::ActionsDAG::Node * func_node, + DB::ActionsDAGPtr & actions_dag, + bool with_nullability) const override + { + /// result is nullable. + /// if result is NaN, convert it to NULL. 
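+ /// ClickHouse's stddev/stddev_samp yields NaN when the sample has fewer than two rows (e.g. a single row), whereas Spark expects NULL here, so the nodes below wrap the result as if(isNaN(result), NULL, result).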
+ auto is_nan_func_node = toFunctionNode(actions_dag, "isNaN", getUniqueName("isNaN"), {func_node}); + auto null_type = DB::makeNullable(func_node->result_type); + auto nullable_col = null_type->createColumn(); + nullable_col->insertDefault(); + const auto * null_node + = &actions_dag->addColumn(DB::ColumnWithTypeAndName(std::move(nullable_col), null_type, getUniqueName("null"))); + DB::ActionsDAG::NodeRawConstPtrs convert_nan_func_args = {is_nan_func_node, null_node, func_node}; + + func_node = toFunctionNode(actions_dag, "if", func_node->result_name, convert_nan_func_args); + actions_dag->addOrReplaceInOutputs(*func_node); + return func_node; + } +}; + +static const AggregateFunctionParserRegister> registerer_stddev; +static const AggregateFunctionParserRegister> registerer_stddev_samp; +} diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/arithmetic.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/arithmetic.cpp index 2a6e435667aa7..d58b22a87e6cc 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/arithmetic.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/arithmetic.cpp @@ -65,6 +65,13 @@ class DecimalType return bounded_to_click_house(precision, scale); } + static DecimalType evalModuloDecimalType(const Int32 p1, const Int32 s1, const Int32 p2, const Int32 s2) + { + const Int32 scale = std::max(s1, s2); + const Int32 precision = std::min(p1 - s1, p2 - s2) + scale; + return bounded_to_click_house(precision, scale); + } + static DecimalType evalMultiplyDecimalType(const Int32 p1, const Int32 s1, const Int32 p2, const Int32 s2) { const Int32 scale = s1; @@ -221,6 +228,20 @@ class FunctionParserMultiply final : public FunctionParserBinaryArithmetic } }; +class FunctionParserModulo final : public FunctionParserBinaryArithmetic +{ +public: + explicit FunctionParserModulo(SerializedPlanParser * plan_parser_) : FunctionParserBinaryArithmetic(plan_parser_) { } + static constexpr auto name = "modulus"; + String getName() const override { return name; } + +protected: + DecimalType internalEvalType(const Int32 p1, const Int32 s1, const Int32 p2, const Int32 s2) const override + { + return DecimalType::evalModuloDecimalType(p1, s1, p2, s2); + } +}; + class FunctionParserDivide final : public FunctionParserBinaryArithmetic { public: @@ -252,5 +273,6 @@ static FunctionParserRegister register_plus; static FunctionParserRegister register_minus; static FunctionParserRegister register_mltiply; static FunctionParserRegister register_divide; +static FunctionParserRegister register_modulo; } diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/arrayHighOrderFunctions.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/arrayHighOrderFunctions.cpp new file mode 100644 index 0000000000000..584bc0ef1e04f --- /dev/null +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/arrayHighOrderFunctions.cpp @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB::ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +namespace local_engine +{ +class ArrayFilter : public FunctionParser +{ +public: + static constexpr auto name = "filter"; + explicit ArrayFilter(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) {} + ~ArrayFilter() override = default; + + String getName() const override { return name; } + + String getCHFunctionName(const substrait::Expression_ScalarFunction & scalar_function) const override + { + return "arrayFilter"; + } + + const DB::ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction & substrait_func, + DB::ActionsDAGPtr & actions_dag) const + { + auto ch_func_name = getCHFunctionName(substrait_func); + auto parsed_args = parseFunctionArguments(substrait_func, ch_func_name, actions_dag); + assert(parsed_args.size() == 2); + if (collectLambdaArguments(*plan_parser, substrait_func.arguments()[1].value().scalar_function()).size() == 1) + return toFunctionNode(actions_dag, ch_func_name, {parsed_args[1], parsed_args[0]}); + + /// filter with index argument. + const auto * range_end_node = toFunctionNode(actions_dag, "length", {toFunctionNode(actions_dag, "assumeNotNull", {parsed_args[0]})}); + range_end_node = ActionsDAGUtil::convertNodeType( + actions_dag, range_end_node, "Nullable(Int32)", range_end_node->result_name); + const auto * index_array_node = toFunctionNode( + actions_dag, + "range", + {addColumnToActionsDAG(actions_dag, std::make_shared(), 0), range_end_node}); + return toFunctionNode(actions_dag, ch_func_name, {parsed_args[1], parsed_args[0], index_array_node}); + } +}; +static FunctionParserRegister register_array_filter; + +class ArrayTransform : public FunctionParser +{ +public: + static constexpr auto name = "transform"; + explicit ArrayTransform(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) {} + ~ArrayTransform() override = default; + String getName() const override { return name; } + String getCHFunctionName(const substrait::Expression_ScalarFunction & scalar_function) const override + { + return "arrayMap"; + } + + const DB::ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction & substrait_func, + DB::ActionsDAGPtr & actions_dag) const + { + auto ch_func_name = getCHFunctionName(substrait_func); + auto lambda_args = collectLambdaArguments(*plan_parser, substrait_func.arguments()[1].value().scalar_function()); + auto parsed_args = parseFunctionArguments(substrait_func, ch_func_name, actions_dag); + assert(parsed_args.size() == 2); + if (lambda_args.size() == 1) + { + return toFunctionNode(actions_dag, ch_func_name, {parsed_args[1], parsed_args[0]}); + } + + /// transform with index argument. 
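+ /// ClickHouse's arrayMap provides no implicit element index, so for a two-argument (element, index) lambda an explicit index array is built below via range(0, length(arr)) and passed as an extra array argument, e.g. arrayMap(lambda, arr, [0, 1, ..., length(arr) - 1]).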
+ const auto * range_end_node = toFunctionNode(actions_dag, "length", {toFunctionNode(actions_dag, "assumeNotNull", {parsed_args[0]})}); + range_end_node = ActionsDAGUtil::convertNodeType( + actions_dag, range_end_node, "Nullable(Int32)", range_end_node->result_name); + const auto * index_array_node = toFunctionNode( + actions_dag, + "range", + {addColumnToActionsDAG(actions_dag, std::make_shared(), 0), range_end_node}); + return toFunctionNode(actions_dag, ch_func_name, {parsed_args[1], parsed_args[0], index_array_node}); + } +}; +static FunctionParserRegister register_array_map; + +class ArrayAggregate : public FunctionParser +{ +public: + static constexpr auto name = "aggregate"; + explicit ArrayAggregate(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) {} + ~ArrayAggregate() override = default; + String getName() const override { return name; } + String getCHFunctionName(const substrait::Expression_ScalarFunction & scalar_function) const override + { + return "arrayFold"; + } + const DB::ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction & substrait_func, + DB::ActionsDAGPtr & actions_dag) const + { + auto ch_func_name = getCHFunctionName(substrait_func); + auto parsed_args = parseFunctionArguments(substrait_func, ch_func_name, actions_dag); + assert(parsed_args.size() == 3); + const auto * function_type = typeid_cast(parsed_args[2]->result_type.get()); + if (!function_type) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "The third argument of aggregate function must be a lambda function"); + if (!parsed_args[1]->result_type->equals(*(function_type->getReturnType()))) + { + parsed_args[1] = ActionsDAGUtil::convertNodeType( + actions_dag, + parsed_args[1], + function_type->getReturnType()->getName(), + parsed_args[1]->result_name); + } + + /// arrayFold cannot accept nullable(array) + const auto * array_col_node = parsed_args[0]; + if (parsed_args[0]->result_type->isNullable()) + { + array_col_node = toFunctionNode(actions_dag, "assumeNotNull", {parsed_args[0]}); + } + const auto * func_node = toFunctionNode(actions_dag, ch_func_name, {parsed_args[2], array_col_node, parsed_args[1]}); + /// For null array, result is null. + /// TODO: make a new version of arrayFold that can handle nullable array. + const auto * is_null_node = toFunctionNode(actions_dag, "isNull", {parsed_args[0]}); + const auto * null_node = addColumnToActionsDAG(actions_dag, DB::makeNullable(func_node->result_type), DB::Null()); + return toFunctionNode(actions_dag, "if", {is_null_node, null_node, func_node}); + } +}; +static FunctionParserRegister register_array_aggregate; + +} diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/bitLength.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/bitLength.cpp new file mode 100644 index 0000000000000..9358c45788cf8 --- /dev/null +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/bitLength.cpp @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} +} + +namespace local_engine +{ +class FunctionParserBitLength : public FunctionParser +{ +public: + explicit FunctionParserBitLength(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) { } + ~FunctionParserBitLength() override = default; + + static constexpr auto name = "bit_length"; + + String getName() const override { return name; } + + const ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction & substrait_func, ActionsDAGPtr & actions_dag) const override + { + // parse bit_length(a) as octet_length(a) * 8 + auto parsed_args = parseFunctionArguments(substrait_func, "", actions_dag); + if (parsed_args.size() != 1) + throw Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly one arguments", getName()); + + const auto * arg = parsed_args[0]; + const auto * new_arg = arg; + if (isInt(DB::removeNullable(arg->result_type))) + { + const auto * string_type_node = addColumnToActionsDAG(actions_dag, std::make_shared(), "Nullable(String)"); + new_arg = toFunctionNode(actions_dag, "CAST", {arg, string_type_node}); + } + + const auto * octet_length_node = toFunctionNode(actions_dag, "octet_length", {new_arg}); + const auto * const_eight_node = addColumnToActionsDAG(actions_dag, std::make_shared(), 8); + const auto * result_node = toFunctionNode(actions_dag, "multiply", {octet_length_node, const_eight_node}); + + return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag);; + } +}; + +static FunctionParserRegister register_bit_length; +} diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/chr.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/chr.cpp new file mode 100644 index 0000000000000..d168e63d11dc6 --- /dev/null +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/chr.cpp @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} +} + +namespace local_engine +{ +class FunctionParserChr : public FunctionParser +{ +public: + explicit FunctionParserChr(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) { } + ~FunctionParserChr() override = default; + static constexpr auto name = "chr"; + String getName() const override { return name; } + + const ActionsDAG::Node * parse( + const substrait::Expression_ScalarFunction & substrait_func, + ActionsDAGPtr & actions_dag) const override + { + auto parsed_args = parseFunctionArguments(substrait_func, "", actions_dag); + if (parsed_args.size() != 1) + throw Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly one argument", getName()); + + /* + parse chr(number) as if(number < 0, '', convertCharset(char(0, number), 'unicode', 'utf-8')) + */ + const auto & num_arg = parsed_args[0]; + const auto * const_zero_node = addColumnToActionsDAG(actions_dag, std::make_shared(), 0); + const auto * const_empty_node = addColumnToActionsDAG(actions_dag, std::make_shared(), ""); + const auto * const_four_node = addColumnToActionsDAG(actions_dag, std::make_shared(), 4); + const auto * const_unicode_node = addColumnToActionsDAG(actions_dag, std::make_shared(), "unicode"); + const auto * const_utf8_node = addColumnToActionsDAG(actions_dag, std::make_shared(), "utf-8"); + + const auto * less_node = toFunctionNode(actions_dag, "less", {num_arg, const_zero_node}); + + const auto * char_node = toFunctionNode(actions_dag, "char", {const_zero_node, num_arg}); + const auto * convert_charset_node = toFunctionNode(actions_dag, "convertCharset", {char_node, const_unicode_node, const_utf8_node}); + + const auto * if_node = toFunctionNode(actions_dag, "if", {less_node, const_empty_node, convert_charset_node}); + const auto * result_node = convertNodeTypeIfNeeded(substrait_func, if_node, actions_dag); + return result_node; + } +}; + +static FunctionParserRegister register_chr; +} diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/concat.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/concat.cpp new file mode 100644 index 0000000000000..416fe7741812d --- /dev/null +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/concat.cpp @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} +} + +namespace local_engine +{ + +class FunctionParserConcat : public FunctionParser +{ +public: + explicit FunctionParserConcat(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) {} + ~FunctionParserConcat() override = default; + + static constexpr auto name = "concat"; + + String getName() const override { return name; } + + const ActionsDAG::Node * parse( + const substrait::Expression_ScalarFunction & substrait_func, + ActionsDAGPtr & actions_dag) const override + { + /* + parse concat(args) as: + 1. if output type is array, return arrayConcat(args) + 2. otherwise: + 1) if args is empty, return empty string + 2) if args have size 1, return identity(args[0]) + 3) otherwise return concat(args) + */ + auto args = parseFunctionArguments(substrait_func, "", actions_dag); + const auto & output_type = substrait_func.output_type(); + const ActionsDAG::Node * result_node = nullptr; + if (output_type.has_list()) + { + result_node = toFunctionNode(actions_dag, "arrayConcat", args); + } + else + { + if (args.empty()) + result_node = addColumnToActionsDAG(actions_dag, std::make_shared(), ""); + else if (args.size() == 1) + result_node = toFunctionNode(actions_dag, "identity", args); + else + result_node = toFunctionNode(actions_dag, "concat", args); + } + return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag); + } +}; + +static FunctionParserRegister register_concat; +} diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/fromUtcTimestamp.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/fromUtcTimestamp.cpp new file mode 100644 index 0000000000000..8d23231055c3f --- /dev/null +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/fromUtcTimestamp.cpp @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace local_engine +{ + +class FunctionParserFromUtcTimestamp : public FunctionParserUtcTimestampTransform +{ +public: + explicit FunctionParserFromUtcTimestamp(SerializedPlanParser * plan_parser_) : FunctionParserUtcTimestampTransform(plan_parser_) { } + ~FunctionParserFromUtcTimestamp() = default; + + static constexpr auto name = "from_utc_timestamp"; + String getCHFunctionName(const substrait::Expression_ScalarFunction &) const override { return "from_utc_timestamp"; } + String getName() const override { return "from_utc_timestamp"; } +}; + +static FunctionParserRegister fromUtcTimestamp; +} diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/lambdaFunction.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/lambdaFunction.cpp new file mode 100644 index 0000000000000..57c076ed2670d --- /dev/null +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/lambdaFunction.cpp @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB::ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +namespace local_engine +{ +DB::NamesAndTypesList collectLambdaArguments(const SerializedPlanParser & plan_parser_, const substrait::Expression_ScalarFunction & substrait_func) +{ + DB::NamesAndTypesList lambda_arguments; + std::unordered_set collected_names; + + for (const auto & arg : substrait_func.arguments()) + { + if (arg.value().has_scalar_function() + && plan_parser_.getFunctionSignatureName(arg.value().scalar_function().function_reference()) == "namedlambdavariable") + { + auto [_, col_name_field] = plan_parser_.parseLiteral(arg.value().scalar_function().arguments()[0].value().literal()); + String col_name = col_name_field.get(); + if (collected_names.contains(col_name)) + { + continue; + } + collected_names.insert(col_name); + auto type = TypeParser::parseType(arg.value().scalar_function().output_type()); + lambda_arguments.emplace_back(col_name, type); + } + } + return lambda_arguments; +} + +/// Refer to `PlannerActionsVisitorImpl::visitLambda` for how to build a lambda function node. 
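+/// A hedged, illustrative example of what collectLambdaArguments() above produces (the expression and types are assumptions, not taken from a real plan): for transform(arr, (x, i) -> x + i), the two namedlambdavariable arguments yield a NamesAndTypesList like [("x", element type), ("i", index type)], with duplicated variable names skipped via `collected_names`.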
+class LambdaFunction : public FunctionParser +{ +public: + static constexpr auto name = "lambdafunction"; + explicit LambdaFunction(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) {} + ~LambdaFunction() override = default; + + String getName() const override { return name; } +protected: + String getCHFunctionName(const substrait::Expression_ScalarFunction & scalar_function) const override + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "getCHFunctionName is not implemented for LambdaFunction"); + } + + DB::ActionsDAG::NodeRawConstPtrs parseFunctionArguments( + const substrait::Expression_ScalarFunction & substrait_func, + const String & ch_func_name, + DB::ActionsDAGPtr & actions_dag) const override + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "parseFunctionArguments is not implemented for LambdaFunction"); + } + + const DB::ActionsDAG::Node * convertNodeTypeIfNeeded( + const substrait::Expression_ScalarFunction & substrait_func, + const DB::ActionsDAG::Node * func_node, + DB::ActionsDAGPtr & actions_dag) const override + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "convertNodeTypeIfNeeded is not implemented for LambdaFunction"); + } + + const DB::ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction & substrait_func, DB::ActionsDAGPtr & actions_dag) const override + { + /// In some special cases, for example `transform(arr, x -> concat(arr, array(x)))`, the lambda body refers to + /// a column `arr` outside of its own arguments. We need `arr` as an input column for `lambda_actions_dag`. + DB::NamesAndTypesList parent_header; + for (const auto * output_node : actions_dag->getOutputs()) + { + parent_header.emplace_back(output_node->result_name, output_node->result_type); + } + auto lambda_actions_dag = std::make_shared(parent_header); + + /// The first argument is the lambda function body; the following arguments are the lambda variables + /// needed by the lambda function body. + /// There could be a nested lambda function in the lambda body, and it may refer to a variable from + /// this outer lambda function's arguments, e.g. transform(number, x -> transform(letter, y -> struct(x, y))). + /// Before parsing the lambda function body, we first add the lambda function arguments into the actions DAG. 
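+        /// Layout assumed by the loop right below (taken from the code itself, not a guess): substrait_func.arguments()[0] carries the lambda body expression, while arguments()[1..] are namedlambdavariable nodes; parsing the variables first registers them as inputs of `lambda_actions_dag`, so the body parsed afterwards can resolve references to them.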
+ for (size_t i = 1; i < substrait_func.arguments().size(); ++i) + { + (void)parseExpression(lambda_actions_dag, substrait_func.arguments()[i].value()); + } + const auto & substrait_lambda_body = substrait_func.arguments()[0].value(); + const auto * lambda_body_node = parseExpression(lambda_actions_dag, substrait_lambda_body); + lambda_actions_dag->getOutputs().push_back(lambda_body_node); + lambda_actions_dag->removeUnusedActions(Names(1, lambda_body_node->result_name)); + + auto expression_actions_settings = DB::ExpressionActionsSettings::fromContext(getContext(), DB::CompileExpressions::yes); + auto lambda_actions = std::make_shared(lambda_actions_dag, expression_actions_settings); + + DB::Names captured_column_names; + DB::Names required_column_names = lambda_actions->getRequiredColumns(); + DB::ActionsDAG::NodeRawConstPtrs lambda_children; + auto lambda_function_args = collectLambdaArguments(*plan_parser, substrait_func); + const auto & lambda_actions_inputs = lambda_actions_dag->getInputs(); + + std::unordered_map parent_nodes; + for (const auto & node : actions_dag->getNodes()) + { + parent_nodes[node.result_name] = &node; + } + for (const auto & required_column_name : required_column_names) + { + if (std::find_if( + lambda_function_args.begin(), + lambda_function_args.end(), + [&required_column_name](const DB::NameAndTypePair & name_type) { return name_type.name == required_column_name; }) + == lambda_function_args.end()) + { + auto it = std::find_if( + lambda_actions_inputs.begin(), + lambda_actions_inputs.end(), + [&required_column_name](const auto & node) { return node->result_name == required_column_name; }); + if (it == lambda_actions_inputs.end()) + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Required column not found: {}", required_column_name); + } + auto parent_node_it = parent_nodes.find(required_column_name); + if (parent_node_it == parent_nodes.end()) + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Not found column {} in actions dag:\n{}", + required_column_name, + actions_dag->dumpDAG()); + } + /// The nodes must be the ones in `actions_dag`, otherwise `ActionsDAG::evaluatePartialResult` will fail. Because nodes may have the + /// same name but their addresses are different. 
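+                /// For instance (hypothetical column name), `lambda_actions_dag` may own its own input node also called "arr"; since nodes are matched by address rather than by name, the captured child pushed below must be the "arr" node owned by `actions_dag`, looked up through `parent_nodes`.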
+ lambda_children.push_back(parent_node_it->second); + captured_column_names.push_back(required_column_name); + } + } + + auto function_capture = std::make_shared( + lambda_actions, + captured_column_names, + lambda_function_args, + lambda_body_node->result_type, + lambda_body_node->result_name); + + const auto * result = &actions_dag->addFunction(function_capture, lambda_children, lambda_body_node->result_name); + return result; + } +}; + +static FunctionParserRegister register_lambda_function; + + +class NamedLambdaVariable : public FunctionParser +{ +public: + static constexpr auto name = "namedlambdavariable"; + explicit NamedLambdaVariable(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) {} + ~NamedLambdaVariable() override = default; + + String getName() const override { return name; } +protected: + String getCHFunctionName(const substrait::Expression_ScalarFunction & scalar_function) const override + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "getCHFunctionName is not implemented for NamedLambdaVariable"); + } + + DB::ActionsDAG::NodeRawConstPtrs parseFunctionArguments( + const substrait::Expression_ScalarFunction & substrait_func, + const String & ch_func_name, + DB::ActionsDAGPtr & actions_dag) const override + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "parseFunctionArguments is not implemented for NamedLambdaVariable"); + } + + const DB::ActionsDAG::Node * convertNodeTypeIfNeeded( + const substrait::Expression_ScalarFunction & substrait_func, + const DB::ActionsDAG::Node * func_node, + DB::ActionsDAGPtr & actions_dag) const override + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "convertNodeTypeIfNeeded is not implemented for NamedLambdaVariable"); + } + + const DB::ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction & substrait_func, DB::ActionsDAGPtr & actions_dag) const override + { + auto [_, col_name_field] = parseLiteral(substrait_func.arguments()[0].value().literal()); + String col_name = col_name_field.get(); + + auto type = TypeParser::parseType(substrait_func.output_type()); + const auto & inputs = actions_dag->getInputs(); + auto it = std::find_if(inputs.begin(), inputs.end(), [&col_name](const auto * node) { return node->result_name == col_name; }); + if (it == inputs.end()) + { + return &(actions_dag->addInput(col_name, type)); + } + return *it; + } +}; + +static FunctionParserRegister register_named_lambda_variable; + +} diff --git a/cpp-ch/local-engine/Operator/BlockCoalesceOperator.h b/cpp-ch/local-engine/Parser/scalar_function_parser/lambdaFunction.h similarity index 65% rename from cpp-ch/local-engine/Operator/BlockCoalesceOperator.h rename to cpp-ch/local-engine/Parser/scalar_function_parser/lambdaFunction.h index 2b67b40cea1bc..327c72ade47c1 100644 --- a/cpp-ch/local-engine/Operator/BlockCoalesceOperator.h +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/lambdaFunction.h @@ -15,33 +15,9 @@ * limitations under the License. 
*/ #pragma once - -#include - -namespace DB -{ -class Block; -} - +#include +#include namespace local_engine { - -class BlockCoalesceOperator -{ -public: - explicit BlockCoalesceOperator(size_t buf_size_) : buf_size(buf_size_) { } - ~BlockCoalesceOperator(); - - void mergeBlock(DB::Block & block); - bool isFull(); - DB::Block * releaseBlock(); - -private: - void clearCache(); - - size_t buf_size; - ColumnsBuffer block_buffer; - DB::Block * cached_block = nullptr; - -}; -} +DB::NamesAndTypesList collectLambdaArguments(const SerializedPlanParser & plan_parser_, const substrait::Expression_ScalarFunction & substrait_func); +} diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/length.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/length.cpp new file mode 100644 index 0000000000000..85fe1f29aa253 --- /dev/null +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/length.cpp @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} +} + +namespace local_engine +{ +class FunctionParserLength : public FunctionParser +{ +public: + explicit FunctionParserLength(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) { } + ~FunctionParserLength() override = default; + + static constexpr auto name = "char_length"; + + String getName() const override { return name; } + + const ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction & substrait_func, ActionsDAGPtr & actions_dag) const override + { + /** + parse length(a) as + if input is binary type + length(a) as length(a) + else + length(a) as char_length(a) + */ + auto parsed_args = parseFunctionArguments(substrait_func, "", actions_dag); + if (parsed_args.size() != 1) + throw Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly one arguments", getName()); + + const auto * arg = parsed_args[0]; + const auto * new_arg = arg; + if (isInt(removeNullable(arg->result_type))) + { + const auto * string_type_node = addColumnToActionsDAG(actions_dag, std::make_shared(), "Nullable(String)"); + new_arg = toFunctionNode(actions_dag, "CAST", {arg, string_type_node}); + } + + auto function_signature = plan_parser->getFunctionMapping().at(std::to_string(substrait_func.function_reference())); + const ActionsDAG::Node * result_node; + if (function_signature.find("vbin") != std::string::npos) + result_node = toFunctionNode(actions_dag, "length", {new_arg}); + else + result_node = toFunctionNode(actions_dag, "char_length", {new_arg}); + + return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag);; + } +}; + +static FunctionParserRegister register_length; +} diff --git 
a/cpp-ch/local-engine/Parser/scalar_function_parser/ln.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/ln.cpp index 081444ee5f4a5..0bb19dd1d2063 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/ln.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/ln.cpp @@ -25,7 +25,7 @@ class FunctionParserLn : public FunctionParserLogBase explicit FunctionParserLn(SerializedPlanParser * plan_parser_) : FunctionParserLogBase(plan_parser_) {} ~FunctionParserLn() override = default; - static constexpr auto name = "log"; + static constexpr auto name = "ln"; String getName() const override { return name; } String getCHFunctionName() const override { return name; } diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/log.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/log.cpp index 264c0da930c6c..bafca3b213d71 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/log.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/log.cpp @@ -38,7 +38,7 @@ class FunctionParserLog : public FunctionParser explicit FunctionParserLog(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) {} ~FunctionParserLog() override = default; - static constexpr auto name = "logarithm"; + static constexpr auto name = "log"; String getName() const override { return name; } diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/octetLength.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/octetLength.cpp new file mode 100644 index 0000000000000..52cbd03172906 --- /dev/null +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/octetLength.cpp @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} +} + +namespace local_engine +{ +class FunctionParserOctetLength : public FunctionParser +{ +public: + explicit FunctionParserOctetLength(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) { } + ~FunctionParserOctetLength() override = default; + + static constexpr auto name = "octet_length"; + + String getName() const override { return name; } + + const ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction & substrait_func, ActionsDAGPtr & actions_dag) const override + { + auto parsed_args = parseFunctionArguments(substrait_func, "", actions_dag); + if (parsed_args.size() != 1) + throw Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly one arguments", getName()); + + const auto * arg = parsed_args[0]; + const auto * new_arg = arg; + if (isInt(DB::removeNullable(arg->result_type))) + { + const auto * string_type_node = addColumnToActionsDAG(actions_dag, std::make_shared(), "Nullable(String)"); + new_arg = toFunctionNode(actions_dag, "CAST", {arg, string_type_node}); + } + const auto * octet_length_node = toFunctionNode(actions_dag, "octet_length", {new_arg}); + return convertNodeTypeIfNeeded(substrait_func, octet_length_node, actions_dag);; + } +}; + +static FunctionParserRegister register_octet_length; +} diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp index 2c0eeff1ccd37..8f75baf689b26 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp @@ -15,6 +15,8 @@ * limitations under the License. 
*/ +#include + #include namespace DB @@ -56,10 +58,11 @@ class FunctionParserRegexpExtract : public FunctionParser size_t expr_size = expr_str.size(); if (expr_str.data()[expr_size - 1] == '$') expr_str.replace(expr_str.find_last_of("$"), 1, "(?:(\n)*)$"); - - const auto * regex_expr_node = addColumnToActionsDAG(actions_dag, std::make_shared(), expr_str); + + String sparkRegexp = adjustSparkRegexpRule(expr_str); + const auto * regex_expr_node = addColumnToActionsDAG(actions_dag, std::make_shared(), sparkRegexp); auto parsed_args = parseFunctionArguments(substrait_func, "", actions_dag); - parsed_args[1] = regex_expr_node; + parsed_args[1] = regex_expr_node; const auto * result_node = toFunctionNode(actions_dag, "regexpExtract", parsed_args); return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag); } @@ -69,6 +72,77 @@ class FunctionParserRegexpExtract : public FunctionParser else throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {} 2nd argument's type must be const", getName()); } + +private: + String adjustSparkRegexpRule(String & str) const + { + const auto left_bracket_pos = str.find('['); + const auto right_bracket_pos = str.find(']'); + + if (left_bracket_pos == str.npos || right_bracket_pos == str.npos || left_bracket_pos >= right_bracket_pos) + return str; + + auto throw_message = [this, &str]() -> void { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The value of parameter(s) 'regexp' in `{}` is invalid: '{}'", getName(), str); + }; + + ReadBufferFromString buf(str); + std::stack strs; + strs.emplace(""); + bool nead_right_bracket = false; + + while (!buf.eof()) + { + if (*buf.position() == '[') + { + strs.emplace(""); + } + else if (*buf.position() == ']') + { + if (strs.size() == 1) + { + // "ab]c" + strs.top().append("]"); + } + else + { + String back = strs.top(); + strs.pop(); + if (strs.size() == 1) + { + // "abc[abc]abc" + strs.top().append("[").append(back).append("]"); + nead_right_bracket = false; + } + else + { + // "abc[a[abc]c]abc" + strs.top().append(back); + nead_right_bracket = true; + } + } + } + else + { + strs.top() += *buf.position(); + } + + ++buf.position(); + } + + if (nead_right_bracket && strs.size() != 1) + throw_message(); + + while (strs.size() != 1) + { + String back = strs.top(); + strs.pop(); + strs.top().append("[").append(back); + } + + return strs.top(); + } }; static FunctionParserRegister register_regexp_extract; diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/substring.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/substring.cpp index 64c97da805817..550e77344ddf3 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/substring.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/substring.cpp @@ -59,7 +59,9 @@ class FunctionParserSubstring : public FunctionParser const auto * const_one_node = addColumnToActionsDAG(actions_dag, index_type, 1); const auto * equals_zero_node = toFunctionNode(actions_dag, "equals", {index_arg, const_zero_node}); const auto * if_node = toFunctionNode(actions_dag, "if", {equals_zero_node, const_one_node, index_arg}); - const auto * substring_func_node = toFunctionNode(actions_dag, "substringUTF8", {str_arg, if_node, length_arg}); + const auto * less_zero_node = toFunctionNode(actions_dag, "less", {length_arg, const_zero_node}); + const auto * if_len_node = toFunctionNode(actions_dag, "if", {less_zero_node, const_zero_node, length_arg}); + const auto * substring_func_node = toFunctionNode(actions_dag, "substringUTF8", {str_arg, if_node, 
if_len_node}); return convertNodeTypeIfNeeded(substrait_func, substring_func_node, actions_dag); } protected: diff --git a/cpp-ch/local-engine/Operator/BlockCoalesceOperator.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/toUtcTimestamp.cpp similarity index 56% rename from cpp-ch/local-engine/Operator/BlockCoalesceOperator.cpp rename to cpp-ch/local-engine/Parser/scalar_function_parser/toUtcTimestamp.cpp index 756249e8a571d..4b04942bab310 100644 --- a/cpp-ch/local-engine/Operator/BlockCoalesceOperator.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/toUtcTimestamp.cpp @@ -14,40 +14,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "BlockCoalesceOperator.h" -#include -namespace local_engine -{ - -void BlockCoalesceOperator::mergeBlock(DB::Block & block) -{ - block_buffer.add(block, 0, static_cast(block.rows())); -} +#include -bool BlockCoalesceOperator::isFull() +namespace local_engine { - return block_buffer.size() >= buf_size; -} -DB::Block * BlockCoalesceOperator::releaseBlock() +class FunctionParserToUtcTimestamp : public FunctionParserUtcTimestampTransform { - clearCache(); - cached_block = new DB::Block(block_buffer.releaseColumns()); - return cached_block; -} +public: + explicit FunctionParserToUtcTimestamp(SerializedPlanParser * plan_parser_) : FunctionParserUtcTimestampTransform(plan_parser_) { } + ~FunctionParserToUtcTimestamp() = default; -BlockCoalesceOperator::~BlockCoalesceOperator() -{ - clearCache(); -} + static constexpr auto name = "to_utc_timestamp"; + String getCHFunctionName(const substrait::Expression_ScalarFunction &) const override { return "to_utc_timestamp"; } + String getName() const override { return "to_utc_timestamp"; } +}; -void BlockCoalesceOperator::clearCache() -{ - if (cached_block) - { - delete cached_block; - cached_block = nullptr; - } -} +static FunctionParserRegister toUtcTimestamp; } diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/utcTimestampTransform.h b/cpp-ch/local-engine/Parser/scalar_function_parser/utcTimestampTransform.h new file mode 100644 index 0000000000000..87ea19024169a --- /dev/null +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/utcTimestampTransform.h @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} +} + +namespace local_engine +{ + +class FunctionParserUtcTimestampTransform : public FunctionParser +{ +public: + explicit FunctionParserUtcTimestampTransform(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) { } + ~FunctionParserUtcTimestampTransform() override = default; + + const ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction & substrait_func, ActionsDAGPtr & actions_dag) const override + { + /// Convert timezone value to clickhouse backend supported, i.e. GMT+8 -> Etc/GMT-8, +08:00 -> Etc/GMT-8 + if (substrait_func.arguments_size() != 2) + throw DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {}'s must have 2 arguments", getName()); + + const substrait::Expression & arg1 = substrait_func.arguments()[1].value(); + if (!arg1.has_literal() || !arg1.literal().has_string()) + throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {}'s 2nd argument should be string literal", getName()); + + const String & arg1_literal = arg1.literal().string(); + String time_zone_val = DateTimeUtil::convertTimeZone(arg1_literal); + auto parsed_args = parseFunctionArguments(substrait_func, "", actions_dag); + auto nullable_string_type = DB::makeNullable(std::make_shared()); + const auto * time_zone_node = addColumnToActionsDAG(actions_dag, nullable_string_type, time_zone_val); + const auto * result_node = toFunctionNode(actions_dag, getCHFunctionName(substrait_func), {parsed_args[0], time_zone_node}); + return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag); + } +}; +} diff --git a/cpp-ch/local-engine/Rewriter/ExpressionRewriter.h b/cpp-ch/local-engine/Rewriter/ExpressionRewriter.h index cdea86133eb21..c22c64eae7012 100644 --- a/cpp-ch/local-engine/Rewriter/ExpressionRewriter.h +++ b/cpp-ch/local-engine/Rewriter/ExpressionRewriter.h @@ -38,7 +38,7 @@ class GetJsonObjectFunctionWriter : public RelRewriter void rewrite(substrait::Rel & rel) override { - if (!rel.has_project()) + if (!rel.has_filter() && !rel.has_project()) { return; } @@ -51,6 +51,11 @@ class GetJsonObjectFunctionWriter : public RelRewriter /// Collect all get_json_object functions and group by json strings void prepare(const substrait::Rel & rel) { + if (rel.has_filter()) + { + auto & expr = rel.filter().condition(); + prepareOnExpression(expr); + } if (rel.has_project()) { for (auto & expr : rel.project().expressions()) @@ -62,6 +67,12 @@ class GetJsonObjectFunctionWriter : public RelRewriter void rewriteImpl(substrait::Rel & rel) { + if (rel.has_filter()) + { + auto * filter = rel.mutable_filter(); + auto * expression = filter->mutable_condition(); + rewriteExpression(*expression); + } if (rel.has_project()) { auto * project = rel.mutable_project(); diff --git a/cpp-ch/local-engine/Shuffle/CachedShuffleWriter.cpp b/cpp-ch/local-engine/Shuffle/CachedShuffleWriter.cpp index 5a86294341729..fd6f6fd81b5d1 100644 --- a/cpp-ch/local-engine/Shuffle/CachedShuffleWriter.cpp +++ b/cpp-ch/local-engine/Shuffle/CachedShuffleWriter.cpp @@ -25,25 +25,23 @@ namespace DB { - namespace ErrorCodes { - extern const int BAD_ARGUMENTS; +extern const int BAD_ARGUMENTS; } } namespace local_engine { - using namespace DB; -CachedShuffleWriter::CachedShuffleWriter(const String & short_name, const SplitOptions & options_, jobject rss_pusher) : options(options_) 
+CachedShuffleWriter::CachedShuffleWriter(const String & short_name, const SplitOptions & options_, jobject rss_pusher) + : options(options_) { - bool use_external_sort_shuffle = (options.force_sort) && !rss_pusher; if (short_name == "rr") { - partitioner = std::make_unique(options.partition_num, use_external_sort_shuffle); + partitioner = std::make_unique(options.partition_num); } else if (short_name == "hash") { @@ -53,15 +51,15 @@ CachedShuffleWriter::CachedShuffleWriter(const String & short_name, const SplitO { hash_fields.push_back(std::stoi(expr)); } - partitioner = std::make_unique(options.partition_num, hash_fields, options_.hash_algorithm, use_external_sort_shuffle); + partitioner = std::make_unique(options.partition_num, hash_fields, options_.hash_algorithm); } else if (short_name == "single") { options.partition_num = 1; - partitioner = std::make_unique(options.partition_num, use_external_sort_shuffle); + partitioner = std::make_unique(options.partition_num); } else if (short_name == "range") - partitioner = std::make_unique(options.hash_exprs, options.partition_num, use_external_sort_shuffle); + partitioner = std::make_unique(options.hash_exprs, options.partition_num); else throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "unsupported splitter {}", short_name); @@ -77,32 +75,17 @@ CachedShuffleWriter::CachedShuffleWriter(const String & short_name, const SplitO jmethodID celeborn_push_partition_data_method = GetMethodID(env, celeborn_partition_pusher_class, "pushPartitionData", "(I[BI)I"); CLEAN_JNIENV - auto celeborn_client = std::make_unique(rss_pusher, celeborn_push_partition_data_method); - if (use_external_sort_shuffle) - { - partition_writer = std::make_unique(this, std::move(celeborn_client)); - sort_shuffle = true; - } - else - partition_writer = std::make_unique(this, std::move(celeborn_client)); - } - else - { - if (use_external_sort_shuffle) - { - partition_writer = std::make_unique(this); - sort_shuffle = true; - } - else - partition_writer = std::make_unique(this); + celeborn_client = std::make_unique(rss_pusher, celeborn_push_partition_data_method); } + split_result.partition_lengths.resize(options.partition_num, 0); split_result.raw_partition_lengths.resize(options.partition_num, 0); } void CachedShuffleWriter::split(DB::Block & block) { + lazyInitPartitionWriter(block); auto block_info = block.info; initOutputIfNeeded(block); @@ -145,18 +128,50 @@ void CachedShuffleWriter::initOutputIfNeeded(Block & block) } } -SplitResult CachedShuffleWriter::stop() +void CachedShuffleWriter::lazyInitPartitionWriter(Block & input_sample) { - partition_writer->stop(); + if (partition_writer) + return; + +// auto avg_row_size = input_sample.allocatedBytes() / input_sample.rows(); +// auto overhead_memory = std::max(avg_row_size, input_sample.columns() * 16) * options.split_size * options.partition_num; +// auto use_sort_shuffle = overhead_memory > options.spill_threshold * 0.5 || options.partition_num >= 300; + auto use_external_sort_shuffle = options.force_external_sort; + auto use_memory_sort_shuffle = options.force_mermory_sort; + sort_shuffle = use_memory_sort_shuffle || use_external_sort_shuffle; + if (celeborn_client) + { + if (use_external_sort_shuffle) + partition_writer = std::make_unique(this, std::move(celeborn_client)); + else if (use_memory_sort_shuffle) + partition_writer = std::make_unique(this, std::move(celeborn_client)); + else + partition_writer = std::make_unique(this, std::move(celeborn_client)); + } + else + { + if (use_external_sort_shuffle) + 
partition_writer = std::make_unique(this); + else if (use_memory_sort_shuffle) + partition_writer = std::make_unique(this); + else + partition_writer = std::make_unique(this); + } + partitioner->setUseSortShuffle(sort_shuffle); + LOG_INFO(logger, "Use Partition Writer {}", partition_writer->getName()); +} - static auto * logger = &Poco::Logger::get("CachedShuffleWriter"); +SplitResult CachedShuffleWriter::stop() +{ + if (partition_writer) + partition_writer->stop(); LOG_INFO(logger, "CachedShuffleWriter stop, split result: {}", split_result.toString()); return split_result; } size_t CachedShuffleWriter::evictPartitions() { + if (!partition_writer) return 0; return partition_writer->evictPartitions(true, options.flush_block_buffer_before_evict); } - -} +} \ No newline at end of file diff --git a/cpp-ch/local-engine/Shuffle/CachedShuffleWriter.h b/cpp-ch/local-engine/Shuffle/CachedShuffleWriter.h index d1dd4ff2fce6c..e6395c8e47128 100644 --- a/cpp-ch/local-engine/Shuffle/CachedShuffleWriter.h +++ b/cpp-ch/local-engine/Shuffle/CachedShuffleWriter.h @@ -24,10 +24,10 @@ namespace local_engine { - -class PartitionWriter; -class LocalPartitionWriter; -class CelebornPartitionWriter; + class CelebornClient; + class PartitionWriter; + class LocalPartitionWriter; + class CelebornPartitionWriter; class CachedShuffleWriter : public ShuffleWriterBase { @@ -35,8 +35,12 @@ class CachedShuffleWriter : public ShuffleWriterBase friend class PartitionWriter; friend class LocalPartitionWriter; friend class CelebornPartitionWriter; + friend class SortBasedPartitionWriter; + friend class MemorySortLocalPartitionWriter; + friend class MemorySortCelebornPartitionWriter; friend class ExternalSortLocalPartitionWriter; friend class ExternalSortCelebornPartitionWriter; + friend class Spillable; explicit CachedShuffleWriter(const String & short_name, const SplitOptions & options, jobject rss_pusher = nullptr); ~CachedShuffleWriter() override = default; @@ -47,6 +51,7 @@ class CachedShuffleWriter : public ShuffleWriterBase private: void initOutputIfNeeded(DB::Block & block); + void lazyInitPartitionWriter(DB::Block & input_sample); bool stopped = false; DB::Block output_header; @@ -55,7 +60,9 @@ class CachedShuffleWriter : public ShuffleWriterBase std::unique_ptr partitioner; std::vector output_columns_indicies; std::unique_ptr partition_writer; + std::unique_ptr celeborn_client; bool sort_shuffle = false; + Poco::Logger* logger = &Poco::Logger::get("CachedShuffleWriter"); }; } diff --git a/cpp-ch/local-engine/Shuffle/PartitionWriter.cpp b/cpp-ch/local-engine/Shuffle/PartitionWriter.cpp index e0b69316de6a6..d02c79e0a5d6d 100644 --- a/cpp-ch/local-engine/Shuffle/PartitionWriter.cpp +++ b/cpp-ch/local-engine/Shuffle/PartitionWriter.cpp @@ -49,6 +49,7 @@ extern const int LOGICAL_ERROR; using namespace DB; namespace local_engine { +static const String PARTITION_COLUMN_NAME = "partition"; void PartitionWriter::write(const PartitionInfo & partition_info, DB::Block & block) { @@ -120,7 +121,7 @@ void PartitionWriter::write(const PartitionInfo & partition_info, DB::Block & bl } /// Only works for local partition writer - if (!supportsEvictSinglePartition() && options->spill_threshold && current_cached_bytes >= options->spill_threshold) + if (!supportsEvictSinglePartition() && options->spill_threshold && CurrentMemoryTracker::current_memory() >= options->spill_threshold) unsafeEvictPartitions(false, options->flush_block_buffer_before_evict); shuffle_writer->split_result.total_split_time += watch.elapsedNanoseconds(); @@ -157,20 
+158,18 @@ size_t LocalPartitionWriter::unsafeEvictPartitions(bool for_memory_spill, bool f if (buffer->empty()) continue; - PartitionSpillInfo partition_spill_info; - partition_spill_info.start = output.count(); + std::pair offsets; + offsets.first = output.count(); spilled_bytes += buffer->bytes(); size_t written_bytes = buffer->spill(writer); res += written_bytes; compressed_output.sync(); - partition_spill_info.length = output.count() - partition_spill_info.start; + offsets.second = output.count() - offsets.first; shuffle_writer->split_result.raw_partition_lengths[partition_id] += written_bytes; - partition_spill_info.partition_id = partition_id; - info.partition_spill_infos.emplace_back(partition_spill_info); + info.partition_spill_infos[partition_id] = offsets; } - spill_infos.emplace_back(info); shuffle_writer->split_result.total_compress_time += compressed_output.getCompressTime(); shuffle_writer->split_result.total_write_time += compressed_output.getWriteTime(); @@ -182,8 +181,7 @@ size_t LocalPartitionWriter::unsafeEvictPartitions(bool for_memory_spill, bool f { // escape memory track from current thread status; add untracked memory limit for create thread object, avoid trigger memory spill again IgnoreMemoryTracker ignore(settings.spill_memory_overhead); - ThreadFromGlobalPool thread(spill_to_file); - thread.join(); + spill_to_file(); } else { @@ -194,20 +192,35 @@ size_t LocalPartitionWriter::unsafeEvictPartitions(bool for_memory_spill, bool f return res; } -std::vector LocalPartitionWriter::mergeSpills(WriteBuffer & data_file) +String Spillable::getNextSpillFile() +{ + auto file_name = std::to_string(split_options.shuffle_id) + "_" + std::to_string(split_options.map_id) + "_" + std::to_string(spill_infos.size()); + std::hash hasher; + auto hash = hasher(file_name); + auto dir_id = hash % split_options.local_dirs_list.size(); + auto sub_dir_id = (hash / split_options.local_dirs_list.size()) % split_options.num_sub_dirs; + + std::string dir = std::filesystem::path(split_options.local_dirs_list[dir_id]) / std::format("{:02x}", sub_dir_id); + if (!std::filesystem::exists(dir)) + std::filesystem::create_directories(dir); + return std::filesystem::path(dir) / file_name; +} + +std::vector Spillable::mergeSpills(CachedShuffleWriter * shuffle_writer, WriteBuffer & data_file, ExtraData extra_data) { auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(shuffle_writer->options.compress_method), {}); + CompressedWriteBuffer compressed_output(data_file, codec, shuffle_writer->options.io_buffer_size); NativeWriter writer(compressed_output, shuffle_writer->output_header); std::vector partition_length(shuffle_writer->options.partition_num, 0); - std::vector spill_inputs; + std::vector> spill_inputs; spill_inputs.reserve(spill_infos.size()); for (const auto & spill : spill_infos) { // only use readBig - spill_inputs.emplace_back(std::make_shared(spill.spilled_file, 0)); + spill_inputs.emplace_back(std::make_shared(spill.spilled_file, 0)); } Stopwatch write_time_watch; @@ -215,33 +228,46 @@ std::vector LocalPartitionWriter::mergeSpills(WriteBuffer & data_file) Stopwatch serialization_time_watch; size_t merge_io_time = 0; String buffer; - for (size_t partition_id = 0; partition_id < partition_block_buffer.size(); ++partition_id) + for (size_t partition_id = 0; partition_id < split_options.partition_num; ++partition_id) { auto size_before = data_file.count(); io_time_watch.restart(); for (size_t i = 0; i < spill_infos.size(); ++i) { - size_t size = 
spill_infos[i].partition_spill_infos[partition_id].length; + if (!spill_infos[i].partition_spill_infos.contains(partition_id)) + { + continue; + } + size_t size = spill_infos[i].partition_spill_infos[partition_id].second; + size_t offset = spill_infos[i].partition_spill_infos[partition_id].first; + if (!size) + { + continue; + } buffer.reserve(size); - auto count = spill_inputs[i]->readBig(buffer.data(), size); + auto count = spill_inputs[i]->readBigAt(buffer.data(), size, offset, nullptr); + + chassert(count == size); data_file.write(buffer.data(), count); } merge_io_time += io_time_watch.elapsedNanoseconds(); serialization_time_watch.restart(); - if (!partition_block_buffer[partition_id]->empty()) + if (!extra_data.partition_block_buffer.empty() && !extra_data.partition_block_buffer[partition_id]->empty()) { - Block block = partition_block_buffer[partition_id]->releaseColumns(); - partition_buffer[partition_id]->addBlock(std::move(block)); + Block block = extra_data.partition_block_buffer[partition_id]->releaseColumns(); + extra_data.partition_buffer[partition_id]->addBlock(std::move(block)); + } + if (!extra_data.partition_buffer.empty()) + { + size_t raw_size = extra_data.partition_buffer[partition_id]->spill(writer); + shuffle_writer->split_result.raw_partition_lengths[partition_id] += raw_size; } - size_t raw_size = partition_buffer[partition_id]->spill(writer); - compressed_output.sync(); partition_length[partition_id] = data_file.count() - size_before; shuffle_writer->split_result.total_serialize_time += serialization_time_watch.elapsedNanoseconds(); shuffle_writer->split_result.total_bytes_written += partition_length[partition_id]; - shuffle_writer->split_result.raw_partition_lengths[partition_id] += raw_size; } shuffle_writer->split_result.total_write_time += write_time_watch.elapsedNanoseconds(); @@ -253,32 +279,43 @@ std::vector LocalPartitionWriter::mergeSpills(WriteBuffer & data_file) for (const auto & spill : spill_infos) std::filesystem::remove(spill.spilled_file); - return partition_length; } -LocalPartitionWriter::LocalPartitionWriter(CachedShuffleWriter * shuffle_writer_) : PartitionWriter(shuffle_writer_) +void SortBasedPartitionWriter::write(const PartitionInfo & info, DB::Block & block) { + Stopwatch write_time_watch; + if (output_header.columns() == 0) + output_header = block.cloneEmpty(); + auto partition_column = ColumnUInt64::create(); + partition_column->reserve(block.rows()); + partition_column->getData().insert_assume_reserved(info.src_partition_num.begin(), info.src_partition_num.end()); + block.insert({std::move(partition_column), std::make_shared(), PARTITION_COLUMN_NAME}); + if (sort_header.columns() == 0) + { + sort_header = block.cloneEmpty(); + sort_description.emplace_back(SortColumnDescription(PARTITION_COLUMN_NAME)); + } + // partial sort + sortBlock(block, sort_description); + Chunk chunk; + chunk.setColumns(block.getColumns(), block.rows()); + accumulated_blocks.emplace_back(std::move(chunk)); + current_accumulated_bytes += accumulated_blocks.back().allocatedBytes(); + current_accumulated_rows += accumulated_blocks.back().getNumRows(); + shuffle_writer->split_result.total_write_time += write_time_watch.elapsedNanoseconds(); + if (options->spill_threshold && CurrentMemoryTracker::current_memory() >= options->spill_threshold) + unsafeEvictPartitions(false, false); } -String LocalPartitionWriter::getNextSpillFile() +LocalPartitionWriter::LocalPartitionWriter(CachedShuffleWriter * shuffle_writer_) : PartitionWriter(shuffle_writer_), 
Spillable(shuffle_writer_->options) { - auto file_name = std::to_string(options->shuffle_id) + "_" + std::to_string(options->map_id) + "_" + std::to_string(spill_infos.size()); - std::hash hasher; - auto hash = hasher(file_name); - auto dir_id = hash % options->local_dirs_list.size(); - auto sub_dir_id = (hash / options->local_dirs_list.size()) % options->num_sub_dirs; - - std::string dir = std::filesystem::path(options->local_dirs_list[dir_id]) / std::format("{:02x}", sub_dir_id); - if (!std::filesystem::exists(dir)) - std::filesystem::create_directories(dir); - return std::filesystem::path(dir) / file_name; } void LocalPartitionWriter::unsafeStop() { WriteBufferFromFile output(options->data_file, options->io_buffer_size); - auto offsets = mergeSpills(output); + auto offsets = mergeSpills(shuffle_writer, output, {partition_block_buffer, partition_buffer}); shuffle_writer->split_result.partition_lengths = offsets; } @@ -335,30 +372,211 @@ size_t PartitionWriter::bytes() const return bytes; } -void ExternalSortLocalPartitionWriter::write(const PartitionInfo & info, DB::Block & block) +size_t MemorySortLocalPartitionWriter::unsafeEvictPartitions(bool for_memory_spill, bool /*flush_block_buffer*/) { - Stopwatch write_time_watch; - if (output_header.columns() == 0) - output_header = block.cloneEmpty(); - static const String partition_column_name = "partition"; - auto partition_column = ColumnUInt64::create(); - partition_column->reserve(block.rows()); - partition_column->getData().insert_assume_reserved(info.src_partition_num.begin(), info.src_partition_num.end()); - block.insert({std::move(partition_column), std::make_shared(), partition_column_name}); - if (sort_header.columns() == 0) + size_t res = 0; + size_t spilled_bytes = 0; + + auto spill_to_file = [this, &res, &spilled_bytes]() { - sort_header = block.cloneEmpty(); - sort_description.emplace_back(SortColumnDescription(partition_column_name)); + if (accumulated_blocks.empty()) + return; + auto file = getNextSpillFile(); + WriteBufferFromFile output(file, shuffle_writer->options.io_buffer_size); + auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(shuffle_writer->options.compress_method), {}); + CompressedWriteBuffer compressed_output(output, codec, shuffle_writer->options.io_buffer_size); + NativeWriter writer(compressed_output, output_header); + + SpillInfo info; + info.spilled_file = file; + + Stopwatch serialization_time_watch; + MergeSorter sorter(sort_header, std::move(accumulated_blocks), sort_description, adaptiveBlockSize(), 0); + size_t cur_partition_id = 0; + info.partition_spill_infos[cur_partition_id] = {0,0}; + while (auto data = sorter.read()) + { + Block serialized_block = sort_header.cloneWithColumns(data.detachColumns()); + const auto partitions = serialized_block.getByName(PARTITION_COLUMN_NAME).column; + serialized_block.erase(PARTITION_COLUMN_NAME); + size_t row_offset = 0; + while (row_offset < serialized_block.rows()) + { + auto last_idx = searchLastPartitionIdIndex(partitions, row_offset, cur_partition_id); + if (last_idx < 0) + { + auto& last = info.partition_spill_infos[cur_partition_id]; + compressed_output.sync(); + last.second = output.count() - last.first; + cur_partition_id++; + info.partition_spill_infos[cur_partition_id] = {last.first + last.second, 0}; + continue; + } + + if (row_offset == 0 && last_idx == serialized_block.rows() - 1) + { + auto count = writer.write(serialized_block); + shuffle_writer->split_result.raw_partition_lengths[cur_partition_id] += count; + break; + } 
+ else + { + auto cut_block = serialized_block.cloneWithCutColumns(row_offset, last_idx - row_offset + 1); + + auto count = writer.write(cut_block); + shuffle_writer->split_result.raw_partition_lengths[cur_partition_id] += count; + row_offset = last_idx + 1; + if (last_idx != serialized_block.rows() - 1) + { + auto& last = info.partition_spill_infos[cur_partition_id]; + compressed_output.sync(); + last.second = output.count() - last.first; + cur_partition_id++; + info.partition_spill_infos[cur_partition_id] = {last.first + last.second, 0}; + } + } + } + } + compressed_output.sync(); + auto& last = info.partition_spill_infos[cur_partition_id]; + last.second = output.count() - last.first; + spilled_bytes = current_accumulated_bytes; + res = current_accumulated_bytes; + current_accumulated_bytes = 0; + current_accumulated_rows = 0; + std::erase_if(info.partition_spill_infos, [](const auto & item) + { + auto const& [key, value] = item; + return value.second == 0; + }); + spill_infos.emplace_back(info); + shuffle_writer->split_result.total_compress_time += compressed_output.getCompressTime(); + shuffle_writer->split_result.total_io_time += compressed_output.getWriteTime(); + shuffle_writer->split_result.total_serialize_time += serialization_time_watch.elapsedNanoseconds(); + }; + + Stopwatch spill_time_watch; + if (for_memory_spill && options->throw_if_memory_exceed) + { + // escape memory track from current thread status; add untracked memory limit for create thread object, avoid trigger memory spill again + IgnoreMemoryTracker ignore(settings.spill_memory_overhead); + spill_to_file(); } - // partial sort - sortBlock(block, sort_description); - Chunk chunk; - chunk.setColumns(block.getColumns(), block.rows()); - accumulated_blocks.emplace_back(std::move(chunk)); - current_accumulated_bytes += accumulated_blocks.back().allocatedBytes(); - if (current_accumulated_bytes >= max_sort_buffer_size) - unsafeEvictPartitions(false, false); - shuffle_writer->split_result.total_write_time += write_time_watch.elapsedNanoseconds(); + else + { + spill_to_file(); + } + shuffle_writer->split_result.total_spill_time += spill_time_watch.elapsedNanoseconds(); + shuffle_writer->split_result.total_bytes_spilled += spilled_bytes; + return res; +} + +void MemorySortLocalPartitionWriter::unsafeStop() +{ + unsafeEvictPartitions(false, false); + WriteBufferFromFile output(options->data_file, options->io_buffer_size); + auto offsets = mergeSpills(shuffle_writer, output); + shuffle_writer->split_result.partition_lengths = offsets; +} + +size_t MemorySortCelebornPartitionWriter::unsafeEvictPartitions(bool for_memory_spill, bool flush_block_buffer) +{ + size_t res = 0; + size_t spilled_bytes = 0; + auto spill_to_celeborn = [this, for_memory_spill, flush_block_buffer, &res, &spilled_bytes]() + { + Stopwatch serialization_time_watch; + + /// Skip empty buffer + if (accumulated_blocks.empty()) + return; + + WriteBufferFromOwnString output; + auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(shuffle_writer->options.compress_method), {}); + CompressedWriteBuffer compressed_output(output, codec, shuffle_writer->options.io_buffer_size); + NativeWriter writer(compressed_output, shuffle_writer->output_header); + + MergeSorter sorter(sort_header, std::move(accumulated_blocks), sort_description, adaptiveBlockSize(), 0); + size_t cur_partition_id = 0; + auto push_to_celeborn = [&]() + { + compressed_output.sync(); + auto& data = output.str(); + if (!data.empty()) + { + Stopwatch push_time_watch; + 
celeborn_client->pushPartitionData(cur_partition_id, data.data(), data.size()); + shuffle_writer->split_result.total_io_time += push_time_watch.elapsedNanoseconds(); + shuffle_writer->split_result.partition_lengths[cur_partition_id] += data.size(); + } + output.restart(); + }; + + while (auto data = sorter.read()) + { + Block serialized_block = sort_header.cloneWithColumns(data.detachColumns()); + const auto partitions = serialized_block.getByName(PARTITION_COLUMN_NAME).column; + serialized_block.erase(PARTITION_COLUMN_NAME); + size_t row_offset = 0; + while (row_offset < serialized_block.rows()) + { + auto last_idx = searchLastPartitionIdIndex(partitions, row_offset, cur_partition_id); + if (last_idx < 0) + { + push_to_celeborn(); + cur_partition_id++; + continue; + } + + if (row_offset == 0 && last_idx == serialized_block.rows() - 1) + { + auto count = writer.write(serialized_block); + shuffle_writer->split_result.raw_partition_lengths[cur_partition_id] += count; + break; + } + auto cut_block = serialized_block.cloneWithCutColumns(row_offset, last_idx - row_offset + 1); + auto count = writer.write(cut_block); + shuffle_writer->split_result.raw_partition_lengths[cur_partition_id] += count; + row_offset = last_idx + 1; + if (last_idx != serialized_block.rows() - 1) + { + push_to_celeborn(); + cur_partition_id++; + } + } + } + push_to_celeborn(); + spilled_bytes = current_accumulated_bytes; + res = current_accumulated_bytes; + current_accumulated_bytes = 0; + current_accumulated_rows = 0; + + shuffle_writer->split_result.total_compress_time += compressed_output.getCompressTime(); + shuffle_writer->split_result.total_io_time += compressed_output.getWriteTime(); + + shuffle_writer->split_result.total_serialize_time += serialization_time_watch.elapsedNanoseconds(); + }; + + Stopwatch spill_time_watch; + if (for_memory_spill && options->throw_if_memory_exceed) + { + // escape memory track from current thread status; add untracked memory limit for create thread object, avoid trigger memory spill again + IgnoreMemoryTracker ignore(settings.spill_memory_overhead); + spill_to_celeborn(); + } + else + { + spill_to_celeborn(); + } + + shuffle_writer->split_result.total_spill_time += spill_time_watch.elapsedNanoseconds(); + shuffle_writer->split_result.total_bytes_spilled += spilled_bytes; + return res; +} + +void MemorySortCelebornPartitionWriter::unsafeStop() +{ + unsafeEvictPartitions(false, false); } size_t ExternalSortLocalPartitionWriter::unsafeEvictPartitions(bool, bool) @@ -367,6 +585,10 @@ size_t ExternalSortLocalPartitionWriter::unsafeEvictPartitions(bool, bool) IgnoreMemoryTracker ignore(settings.spill_memory_overhead); if (accumulated_blocks.empty()) return 0; + if (max_merge_block_bytes) + { + max_merge_block_size = std::max(max_merge_block_bytes / (current_accumulated_bytes / current_accumulated_rows), 128UL); + } Stopwatch watch; MergeSorter sorter(sort_header, std::move(accumulated_blocks), sort_description, max_merge_block_size, 0); streams.emplace_back(&tmp_data->createStream(sort_header)); @@ -378,6 +600,7 @@ size_t ExternalSortLocalPartitionWriter::unsafeEvictPartitions(bool, bool) streams.back()->finishWriting(); auto result = current_accumulated_bytes; current_accumulated_bytes = 0; + current_accumulated_rows = 0; shuffle_writer->split_result.total_spill_time += watch.elapsedNanoseconds(); return result; } @@ -562,8 +785,7 @@ size_t CelebornPartitionWriter::unsafeEvictSinglePartition(bool for_memory_spill { // escape memory track from current thread status; add untracked memory 
limit for create thread object, avoid trigger memory spill again IgnoreMemoryTracker ignore(settings.spill_memory_overhead); - ThreadFromGlobalPool thread(spill_to_celeborn); - thread.join(); + spill_to_celeborn(); } else { diff --git a/cpp-ch/local-engine/Shuffle/PartitionWriter.h b/cpp-ch/local-engine/Shuffle/PartitionWriter.h index 9c4e75db6efb3..5b4285afda1f9 100644 --- a/cpp-ch/local-engine/Shuffle/PartitionWriter.h +++ b/cpp-ch/local-engine/Shuffle/PartitionWriter.h @@ -17,7 +17,6 @@ #pragma once #include #include -#include #include #include #include @@ -26,6 +25,8 @@ #include #include +#include "CachedShuffleWriter.h" + namespace DB { class MergingSortedAlgorithm; @@ -33,17 +34,11 @@ class MergingSortedAlgorithm; namespace local_engine { -struct PartitionSpillInfo -{ - size_t partition_id; - size_t start; - size_t length; // in Bytes -}; struct SpillInfo { std::string spilled_file; - std::vector partition_spill_infos; + std::map> partition_spill_infos; }; class Partition @@ -113,7 +108,28 @@ class PartitionWriter : boost::noncopyable size_t last_partition_id; }; -class LocalPartitionWriter : public PartitionWriter +class Spillable +{ +public: + struct ExtraData + { + std::vector partition_block_buffer; + std::vector partition_buffer; + }; + + Spillable(SplitOptions options_) : split_options(std::move(options_)) {} + virtual ~Spillable() = default; + +protected: + String getNextSpillFile(); + std::vector mergeSpills(CachedShuffleWriter * shuffle_writer, WriteBuffer & data_file, ExtraData extra_data = {}); + std::vector spill_infos; + +private: + const SplitOptions split_options; +}; + +class LocalPartitionWriter : public PartitionWriter, public Spillable { public: explicit LocalPartitionWriter(CachedShuffleWriter * shuffle_writer); @@ -124,16 +140,79 @@ class LocalPartitionWriter : public PartitionWriter protected: size_t unsafeEvictPartitions(bool for_memory_spill, bool flush_block_buffer) override; void unsafeStop() override; +}; - String getNextSpillFile(); - std::vector mergeSpills(DB::WriteBuffer & data_file); +class SortBasedPartitionWriter : public PartitionWriter +{ +public: + explicit SortBasedPartitionWriter(CachedShuffleWriter * shuffle_writer_) : PartitionWriter(shuffle_writer_) + { + max_merge_block_size = options->split_size; + max_sort_buffer_size = options->max_sort_buffer_size; + max_merge_block_bytes = SerializedPlanParser::global_context->getSettings().prefer_external_sort_block_bytes; + } - std::vector spill_infos; + String getName() const override { return "SortBasedPartitionWriter"; } + void write(const PartitionInfo & info, DB::Block & block) override; + size_t adaptiveBlockSize() + { + size_t res = max_merge_block_size; + if (max_merge_block_bytes) + { + res = std::min(std::max(max_merge_block_bytes / (current_accumulated_bytes / current_accumulated_rows), 128UL), res); + } + return res; + } + +protected: + size_t max_merge_block_size = DB::DEFAULT_BLOCK_SIZE; + size_t max_sort_buffer_size = 1_GiB; + size_t max_merge_block_bytes = 0; + size_t current_accumulated_bytes = 0; + size_t current_accumulated_rows = 0; + Chunks accumulated_blocks; + Block output_header; + Block sort_header; + SortDescription sort_description; +}; + +class MemorySortLocalPartitionWriter : public SortBasedPartitionWriter, public Spillable +{ +public: + explicit MemorySortLocalPartitionWriter(CachedShuffleWriter* shuffle_writer_) + : SortBasedPartitionWriter(shuffle_writer_), Spillable(shuffle_writer_->options) + { + } + + ~MemorySortLocalPartitionWriter() override = default; + String 
getName() const override { return "MemorySortLocalPartitionWriter"; } + +protected: + size_t unsafeEvictPartitions(bool for_memory_spill, bool flush_block_buffer) override; + void unsafeStop() override; +}; + +class MemorySortCelebornPartitionWriter : public SortBasedPartitionWriter +{ +public: + explicit MemorySortCelebornPartitionWriter(CachedShuffleWriter* shuffle_writer_, std::unique_ptr celeborn_client_) + : SortBasedPartitionWriter(shuffle_writer_), celeborn_client(std::move(celeborn_client_)) + { + } + + ~MemorySortCelebornPartitionWriter() override = default; + +protected: + size_t unsafeEvictPartitions(bool for_memory_spill, bool flush_block_buffer) override; + void unsafeStop() override; + +private: + std::unique_ptr celeborn_client; }; class SortedPartitionDataMerger; -class ExternalSortLocalPartitionWriter : public PartitionWriter +class ExternalSortLocalPartitionWriter : public SortBasedPartitionWriter { public: struct MergeContext @@ -142,37 +221,30 @@ class ExternalSortLocalPartitionWriter : public PartitionWriter std::unique_ptr merger; }; - explicit ExternalSortLocalPartitionWriter(CachedShuffleWriter * shuffle_writer_) : PartitionWriter(shuffle_writer_) + explicit ExternalSortLocalPartitionWriter(CachedShuffleWriter * shuffle_writer_) : SortBasedPartitionWriter(shuffle_writer_) { max_merge_block_size = options->split_size; max_sort_buffer_size = options->max_sort_buffer_size; + max_merge_block_bytes = SerializedPlanParser::global_context->getSettings().prefer_external_sort_block_bytes; tmp_data = std::make_unique(SerializedPlanParser::global_context->getTempDataOnDisk()); } ~ExternalSortLocalPartitionWriter() override = default; String getName() const override { return "ExternalSortLocalPartitionWriter"; } - void write(const PartitionInfo & info, DB::Block & block) override; protected: size_t unsafeEvictPartitions(bool for_memory_spill, bool flush_block_buffer) override; /// Prepare for data merging, spill the remaining memory data,and create a merger object. 
MergeContext prepareMerge(); void unsafeStop() override; - std::queue mergeDataInMemory(); + std::queue mergeDataInMemory(); - size_t max_sort_buffer_size = 1_GiB; - size_t max_merge_block_size = DB::DEFAULT_BLOCK_SIZE; - size_t current_accumulated_bytes = 0; - DB::Chunks accumulated_blocks; - DB::Block output_header; - DB::Block sort_header; - DB::SortDescription sort_description; - DB::TemporaryDataOnDiskPtr tmp_data; - std::vector streams; + TemporaryDataOnDiskPtr tmp_data; + std::vector streams; }; -class ExternalSortCelebornPartitionWriter : public ExternalSortLocalPartitionWriter +class ExternalSortCelebornPartitionWriter : public ExternalSortLocalPartitionWriter { public: explicit ExternalSortCelebornPartitionWriter(CachedShuffleWriter * shuffle_writer_, std::unique_ptr celeborn_client_) diff --git a/cpp-ch/local-engine/Shuffle/SelectorBuilder.cpp b/cpp-ch/local-engine/Shuffle/SelectorBuilder.cpp index 5a5e969e1f584..7e3642dacd523 100644 --- a/cpp-ch/local-engine/Shuffle/SelectorBuilder.cpp +++ b/cpp-ch/local-engine/Shuffle/SelectorBuilder.cpp @@ -81,7 +81,7 @@ PartitionInfo RoundRobinSelectorBuilder::build(DB::Block & block) pid = pid_selection; pid_selection = (pid_selection + 1) % parts_num; } - return PartitionInfo::fromSelector(std::move(result), parts_num, use_external_sort_shuffle); + return PartitionInfo::fromSelector(std::move(result), parts_num, use_sort_shuffle); } HashSelectorBuilder::HashSelectorBuilder( @@ -156,7 +156,7 @@ PartitionInfo HashSelectorBuilder::build(DB::Block & block) } } } - return PartitionInfo::fromSelector(std::move(partition_ids), parts_num, use_external_sort_shuffle); + return PartitionInfo::fromSelector(std::move(partition_ids), parts_num, use_sort_shuffle); } @@ -177,7 +177,7 @@ PartitionInfo RangeSelectorBuilder::build(DB::Block & block) { DB::IColumn::Selector result; computePartitionIdByBinarySearch(block, result); - return PartitionInfo::fromSelector(std::move(result), partition_num, use_external_sort_shuffle); + return PartitionInfo::fromSelector(std::move(result), partition_num, use_sort_shuffle); } void RangeSelectorBuilder::initSortInformation(Poco::JSON::Array::Ptr orderings) diff --git a/cpp-ch/local-engine/Shuffle/SelectorBuilder.h b/cpp-ch/local-engine/Shuffle/SelectorBuilder.h index 9510291c88649..97894daa3c14d 100644 --- a/cpp-ch/local-engine/Shuffle/SelectorBuilder.h +++ b/cpp-ch/local-engine/Shuffle/SelectorBuilder.h @@ -46,11 +46,12 @@ struct PartitionInfo class SelectorBuilder { public: - explicit SelectorBuilder(bool use_external_sort_shuffle) : use_external_sort_shuffle(use_external_sort_shuffle) { } + explicit SelectorBuilder(bool use_external_sort_shuffle) : use_sort_shuffle(use_external_sort_shuffle) { } virtual ~SelectorBuilder() = default; virtual PartitionInfo build(DB::Block & block) = 0; + void setUseSortShuffle(bool use_external_sort_shuffle_) { use_sort_shuffle = use_external_sort_shuffle_; } protected: - bool use_external_sort_shuffle = false; + bool use_sort_shuffle = false; }; class RoundRobinSelectorBuilder : public SelectorBuilder diff --git a/cpp-ch/local-engine/Shuffle/ShuffleSplitter.h b/cpp-ch/local-engine/Shuffle/ShuffleSplitter.h index cfd4062611edd..75edea325c67b 100644 --- a/cpp-ch/local-engine/Shuffle/ShuffleSplitter.h +++ b/cpp-ch/local-engine/Shuffle/ShuffleSplitter.h @@ -53,7 +53,8 @@ struct SplitOptions size_t max_sort_buffer_size = 1_GiB; // Whether to spill firstly before stop external sort shuffle. 
bool spill_firstly_before_stop = true; - bool force_sort = true; + bool force_external_sort = false; + bool force_mermory_sort = false; }; class ColumnsBuffer diff --git a/cpp-ch/local-engine/Shuffle/SortedPartitionDataMerger.h b/cpp-ch/local-engine/Shuffle/SortedPartitionDataMerger.h index 31f5547fba55d..e38f58647e963 100644 --- a/cpp-ch/local-engine/Shuffle/SortedPartitionDataMerger.h +++ b/cpp-ch/local-engine/Shuffle/SortedPartitionDataMerger.h @@ -22,6 +22,8 @@ namespace local_engine { + +int64_t searchLastPartitionIdIndex(DB::ColumnPtr column, size_t start, size_t partition_id); class SortedPartitionDataMerger; using SortedPartitionDataMergerPtr = std::unique_ptr; class SortedPartitionDataMerger diff --git a/cpp-ch/local-engine/Storages/CustomStorageMergeTree.cpp b/cpp-ch/local-engine/Storages/CustomStorageMergeTree.cpp index 368015fb92786..ec0e0932fc763 100644 --- a/cpp-ch/local-engine/Storages/CustomStorageMergeTree.cpp +++ b/cpp-ch/local-engine/Storages/CustomStorageMergeTree.cpp @@ -150,6 +150,7 @@ std::atomic CustomStorageMergeTree::part_num; std::vector CustomStorageMergeTree::loadDataPartsWithNames(std::unordered_set parts) { + auto parts_lock = lockParts(); std::vector data_parts; const auto disk = getStoragePolicy()->getDisks().at(0); for (const auto& name : parts) diff --git a/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.cpp b/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.cpp index c709a5f24cca6..406f2aaa23df2 100644 --- a/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.cpp +++ b/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.cpp @@ -16,12 +16,15 @@ */ #include "SparkMergeTreeWriter.h" +#include + #include #include #include +#include #include #include -#include +#include #include @@ -40,7 +43,7 @@ using namespace DB; namespace local_engine { -Block removeColumnSuffix(const DB::Block & block) +Block removeColumnSuffix(const Block & block) { ColumnsWithTypeAndName columns; for (int i = 0; i < block.columns(); ++i) @@ -55,54 +58,87 @@ Block removeColumnSuffix(const DB::Block & block) } SparkMergeTreeWriter::SparkMergeTreeWriter( - CustomStorageMergeTreePtr storage_, - const DB::StorageMetadataPtr & metadata_snapshot_, + const MergeTreeTable & merge_tree_table, const DB::ContextPtr & context_, - const String & uuid_, + const String & part_name_prefix_, const String & partition_dir_, const String & bucket_dir_) - : storage(storage_) - , metadata_snapshot(metadata_snapshot_) - , context(context_) - , uuid(uuid_) + : context(context_) + , part_name_prefix(part_name_prefix_) , partition_dir(partition_dir_) , bucket_dir(bucket_dir_) , thread_pool(CurrentMetrics::LocalThread, CurrentMetrics::LocalThreadActive, CurrentMetrics::LocalThreadScheduled, 1, 1, 100000) { const DB::Settings & settings = context->getSettingsRef(); - squashing_transform - = std::make_unique(settings.min_insert_block_size_rows, settings.min_insert_block_size_bytes); - if (!partition_dir.empty()) - { - extractPartitionValues(partition_dir, partition_values); - } - header = metadata_snapshot->getSampleBlock(); - - Field is_merge; - if (context->getSettings().tryGet("mergetree.merge_after_insert", is_merge)) - merge_after_insert = is_merge.get(); + merge_after_insert = settings.get(MERGETREE_MERGE_AFTER_INSERT).get(); + insert_without_local_storage = settings.get(MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE).get(); Field limit_size_field; - if (context->getSettings().tryGet("optimize.minFileSize", limit_size_field)) + if (settings.tryGet("optimize.minFileSize", 
limit_size_field)) merge_min_size = limit_size_field.get() <= 0 ? merge_min_size : limit_size_field.get(); Field limit_cnt_field; - if (context->getSettings().tryGet("mergetree.max_num_part_per_merge_task", limit_cnt_field)) + if (settings.tryGet("mergetree.max_num_part_per_merge_task", limit_cnt_field)) merge_limit_parts = limit_cnt_field.get() <= 0 ? merge_limit_parts : limit_cnt_field.get(); + + dest_storage = MergeTreeRelParser::parseStorage(merge_tree_table, SerializedPlanParser::global_context); + isRemoteStorage = dest_storage->getStoragePolicy()->getAnyDisk()->isRemote(); + + if (useLocalStorage()) + { + temp_storage = MergeTreeRelParser::copyToDefaultPolicyStorage(merge_tree_table, SerializedPlanParser::global_context); + storage = temp_storage; + LOG_DEBUG( + &Poco::Logger::get("SparkMergeTreeWriter"), + "Create temp table {} for local merge.", + temp_storage->getStorageID().getFullNameNotQuoted()); + } + else + storage = dest_storage; + + metadata_snapshot = storage->getInMemoryMetadataPtr(); + header = metadata_snapshot->getSampleBlock(); + squashing = std::make_unique(header, settings.min_insert_block_size_rows, settings.min_insert_block_size_bytes); + if (!partition_dir.empty()) + extractPartitionValues(partition_dir, partition_values); } -void SparkMergeTreeWriter::write(DB::Block & block) +bool SparkMergeTreeWriter::useLocalStorage() const +{ + return !insert_without_local_storage && isRemoteStorage; +} + +void SparkMergeTreeWriter::write(const DB::Block & block) { auto new_block = removeColumnSuffix(block); - auto converter = ActionsDAG::makeConvertingActions(new_block.getColumnsWithTypeAndName(), header.getColumnsWithTypeAndName(), DB::ActionsDAG::MatchColumnsMode::Position);; - if (converter) + if (auto converter = ActionsDAG::makeConvertingActions( + new_block.getColumnsWithTypeAndName(), header.getColumnsWithTypeAndName(), DB::ActionsDAG::MatchColumnsMode::Position)) + ExpressionActions(converter).execute(new_block); + + bool has_part = chunkToPart(squashing->add({new_block.getColumns(), new_block.rows()})); + + if (has_part && merge_after_insert) + checkAndMerge(); +} + +bool SparkMergeTreeWriter::chunkToPart(Chunk && chunk) +{ + if (chunk.hasChunkInfo()) { - ExpressionActions do_convert = ExpressionActions(converter); - do_convert.execute(new_block); + Chunk squash_chunk = DB::Squashing::squash(std::move(chunk)); + Block result = header.cloneWithColumns(squash_chunk.getColumns()); + return blockToPart(result); } + return false; +} + +bool SparkMergeTreeWriter::blockToPart(Block & block) +{ + auto blocks_with_partition = MergeTreeDataWriter::splitBlockIntoParts(std::move(block), 10, metadata_snapshot, context); + + if (blocks_with_partition.empty()) + return false; - auto blocks_with_partition - = MergeTreeDataWriter::splitBlockIntoParts(squashing_transform->add(new_block), 10, metadata_snapshot, context); for (auto & item : blocks_with_partition) { size_t before_write_memory = 0; @@ -120,8 +156,7 @@ void SparkMergeTreeWriter::write(DB::Block & block) item.partition.clear(); } - if (!blocks_with_partition.empty() && merge_after_insert) - checkAndMerge(); + return true; } void SparkMergeTreeWriter::manualFreeMemory(size_t before_write_memory) @@ -130,10 +165,10 @@ void SparkMergeTreeWriter::manualFreeMemory(size_t before_write_memory) // it may alloc memory in current thread, and free on global thread. // Now, wo have not idea to clear global memory by used spark thread tracker. // So we manually correct the memory usage. 
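The comment above explains why the writer corrects memory accounting by hand: a write can allocate on the current (Spark task) thread while the memory is later freed on a global ClickHouse thread, so the per-thread tracker drifts away from what the task really holds. The snippet below is a rough illustration of that reconciliation idea only; FakeTracker and the byte counts are invented for the sketch and are not Gluten or ClickHouse APIs. The pattern is: snapshot the tracker before the write, compare the two accountings afterwards, and book the difference as untracked memory so both views agree.

#include <cstdint>
#include <iostream>

// Hypothetical stand-ins for the two accounting systems involved:
// the engine's per-thread tracker and the Spark-side task accounting.
struct FakeTracker
{
    int64_t bytes = 0;
    int64_t get() const { return bytes; }
    void adjustWithUntracked(int64_t delta) { bytes += delta; }
};

int main()
{
    FakeTracker thread_tracker;   // what this thread thinks it allocated
    int64_t task_accounting = 0;  // what the task-level accounting recorded

    const int64_t before_write = thread_tracker.get();

    // A write allocates 96 MiB on this thread, but 32 MiB of it will be
    // released later on a background thread, so only 64 MiB really belongs
    // to this task right now.
    thread_tracker.bytes += 96 << 20;
    task_accounting += 64 << 20;

    // Reconcile: book the difference as "untracked" so both views agree.
    const int64_t diff = task_accounting - thread_tracker.get();
    thread_tracker.adjustWithUntracked(diff);

    std::cout << "before=" << before_write
              << " after=" << thread_tracker.get()
              << " task=" << task_accounting << "\n";
    return 0;
}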
- auto disk = storage->getStoragePolicy()->getAnyDisk(); - if (!disk->isRemote()) + if (isRemoteStorage && insert_without_local_storage) return; + auto disk = storage->getStoragePolicy()->getAnyDisk(); std::lock_guard lock(memory_mutex); auto * memory_tracker = CurrentThread::getMemoryTracker(); if (memory_tracker && CurrentMemoryTracker::before_free) @@ -146,7 +181,6 @@ void SparkMergeTreeWriter::manualFreeMemory(size_t before_write_memory) memory_tracker->adjustWithUntrackedMemory(diff_ch_alloc); } - const size_t a = memory_tracker->get(); const size_t spark_alloc = CurrentMemoryTracker::current_memory(); const size_t diff_alloc = spark_alloc - memory_tracker->get(); @@ -157,38 +191,81 @@ void SparkMergeTreeWriter::manualFreeMemory(size_t before_write_memory) void SparkMergeTreeWriter::finalize() { - auto block = squashing_transform->add({}); - if (block.rows()) + chunkToPart(squashing->flush()); + if (merge_after_insert) + finalizeMerge(); + + commitPartToRemoteStorageIfNeeded(); + saveMetadata(); +} + +void SparkMergeTreeWriter::saveMetadata() +{ + if (!isRemoteStorage) + return; + + for (const auto & merge_tree_data_part : new_parts.unsafeGet()) { - auto blocks_with_partition = MergeTreeDataWriter::splitBlockIntoParts(std::move(block), 10, metadata_snapshot, context); - for (auto & item : blocks_with_partition) + auto part = dest_storage->loadDataPartsWithNames({merge_tree_data_part->name}); + if (part.empty()) { - size_t before_write_memory = 0; - if (auto * memory_tracker = CurrentThread::getMemoryTracker()) - { - CurrentThread::flushUntrackedMemory(); - before_write_memory = memory_tracker->get(); - } - - new_parts.emplace_back(writeTempPartAndFinalize(item, metadata_snapshot).part); - part_num++; - manualFreeMemory(before_write_memory); - /// Reset earlier to free memory - item.block.clear(); - item.partition.clear(); + LOG_WARNING( + &Poco::Logger::get("SparkMergeTreeWriter"), + "Save metadata failed because dest storage load part name {} empty.", + merge_tree_data_part->name); + continue; } - } - SCOPE_EXIT({ - for (auto merge_tree_data_part : new_parts.unsafeGet()) - saveFileStatus( - *storage, context, merge_tree_data_part->name, const_cast(merge_tree_data_part->getDataPartStorage())); - }); + saveFileStatus( + *dest_storage, context, merge_tree_data_part->name, const_cast(part.at(0)->getDataPartStorage())); + } +} - if (!merge_after_insert) +void SparkMergeTreeWriter::commitPartToRemoteStorageIfNeeded() +{ + if (!useLocalStorage()) return; - // wait all merge task end and do final merge + LOG_DEBUG( + &Poco::Logger::get("SparkMergeTreeWriter"), "Begin upload to disk {}.", dest_storage->getStoragePolicy()->getAnyDisk()->getName()); + + auto read_settings = context->getReadSettings(); + auto write_settings = context->getWriteSettings(); + Stopwatch watch; + for (const auto & merge_tree_data_part : new_parts.unsafeGet()) + { + String local_relative_path = storage->getRelativeDataPath() + "/" + merge_tree_data_part->name; + String remote_relative_path = dest_storage->getRelativeDataPath() + "/" + merge_tree_data_part->name; + + storage->getStoragePolicy()->getAnyDisk()->copyDirectoryContent( + local_relative_path, + dest_storage->getStoragePolicy()->getAnyDisk(), + remote_relative_path, + read_settings, + write_settings, + nullptr); + LOG_DEBUG( + &Poco::Logger::get("SparkMergeTreeWriter"), + "Upload part {} to disk {} success.", + merge_tree_data_part->name, + dest_storage->getStoragePolicy()->getAnyDisk()->getName()); + } + watch.stop(); + LOG_INFO( + 
&Poco::Logger::get("SparkMergeTreeWriter"), + "Upload to disk {} finished, total elapsed {} ms", + dest_storage->getStoragePolicy()->getAnyDisk()->getName(), + watch.elapsedMilliseconds()); + StorageMergeTreeFactory::freeStorage(temp_storage->getStorageID()); + temp_storage->dropAllData(); + LOG_DEBUG( + &Poco::Logger::get("SparkMergeTreeWriter"), "Clean temp table {} success.", temp_storage->getStorageID().getFullNameNotQuoted()); +} + +void SparkMergeTreeWriter::finalizeMerge() +{ + LOG_DEBUG(&Poco::Logger::get("SparkMergeTreeWriter"), "Waiting all merge task end and do final merge"); + // waiting all merge task end and do final merge thread_pool.wait(); size_t before_merge_size; @@ -200,30 +277,32 @@ void SparkMergeTreeWriter::finalize() } while (before_merge_size != new_parts.size()); std::unordered_set final_parts; - for (auto merge_tree_data_part : new_parts.unsafeGet()) + for (const auto & merge_tree_data_part : new_parts.unsafeGet()) final_parts.emplace(merge_tree_data_part->name); - for (const auto & tmp_part : tmp_parts) + // default storage need clean temp. + if (!temp_storage) { - if (final_parts.contains(tmp_part)) - continue; + for (const auto & tmp_part : tmp_parts) + { + if (final_parts.contains(tmp_part)) + continue; - GlobalThreadPool::instance().scheduleOrThrow( - [&]() -> void - { - for (auto disk : storage->getDisks()) + GlobalThreadPool::instance().scheduleOrThrow( + [&]() -> void { - auto full_path = storage->getFullPathOnDisk(disk); - disk->removeRecursive(full_path + "/" + tmp_part); - } - }); + for (const auto & disk : storage->getDisks()) + { + auto rel_path = storage->getRelativeDataPath() + "/" + tmp_part; + disk->removeRecursive(rel_path); + } + }); + } } } -DB::MergeTreeDataWriter::TemporaryPart -SparkMergeTreeWriter::writeTempPartAndFinalize( - DB::BlockWithPartition & block_with_partition, - const DB::StorageMetadataPtr & metadata_snapshot) +DB::MergeTreeDataWriter::TemporaryPart SparkMergeTreeWriter::writeTempPartAndFinalize( + DB::BlockWithPartition & block_with_partition, const DB::StorageMetadataPtr & metadata_snapshot) { MergeTreeDataWriter::TemporaryPart temp_part; writeTempPart(temp_part, block_with_partition, metadata_snapshot); @@ -231,8 +310,8 @@ SparkMergeTreeWriter::writeTempPartAndFinalize( return temp_part; } -void SparkMergeTreeWriter::writeTempPart(MergeTreeDataWriter::TemporaryPart & temp_part, - BlockWithPartition & block_with_partition, const StorageMetadataPtr & metadata_snapshot) +void SparkMergeTreeWriter::writeTempPart( + MergeTreeDataWriter::TemporaryPart & temp_part, BlockWithPartition & block_with_partition, const StorageMetadataPtr & metadata_snapshot) { Block & block = block_with_partition.block; @@ -251,21 +330,13 @@ void SparkMergeTreeWriter::writeTempPart(MergeTreeDataWriter::TemporaryPart & te std::string part_dir; if (!partition_dir.empty() && !bucket_dir.empty()) - { - part_dir = fmt::format("{}/{}/{}_{:03d}", partition_dir, bucket_dir, uuid, part_num); - } + part_dir = fmt::format("{}/{}/{}_{:03d}", partition_dir, bucket_dir, part_name_prefix, part_num); else if (!partition_dir.empty()) - { - part_dir = fmt::format("{}/{}_{:03d}", partition_dir, uuid, part_num); - } + part_dir = fmt::format("{}/{}_{:03d}", partition_dir, part_name_prefix, part_num); else if (!bucket_dir.empty()) - { - part_dir = fmt::format("{}/{}_{:03d}", bucket_dir, uuid, part_num); - } + part_dir = fmt::format("{}/{}_{:03d}", bucket_dir, part_name_prefix, part_num); else - { - part_dir = fmt::format("{}_{:03d}", uuid, part_num); - } + part_dir = 
fmt::format("{}_{:03d}", part_name_prefix, part_num); String part_name = part_dir; @@ -352,7 +423,7 @@ void SparkMergeTreeWriter::writeTempPart(MergeTreeDataWriter::TemporaryPart & te /// This effectively chooses minimal compression method: /// either default lz4 or compression method with zero thresholds on absolute and relative part size. auto compression_codec = storage->getContext()->chooseCompressionCodec(0, 0); - + auto txn = context->getCurrentTransaction(); auto out = std::make_unique( new_data_part, metadata_snapshot, @@ -360,7 +431,7 @@ void SparkMergeTreeWriter::writeTempPart(MergeTreeDataWriter::TemporaryPart & te indices, MergeTreeStatisticsFactory::instance().getMany(metadata_snapshot->getColumns()), compression_codec, - context->getCurrentTransaction(), + txn ? txn->tid : Tx::PrehistoricTID, false, false, context->getWriteSettings()); @@ -369,8 +440,7 @@ void SparkMergeTreeWriter::writeTempPart(MergeTreeDataWriter::TemporaryPart & te auto finalizer = out->finalizePartAsync(new_data_part, data_settings->fsync_after_insert, nullptr, nullptr); temp_part.part = new_data_part; - temp_part.streams.emplace_back( - MergeTreeDataWriter::TemporaryPart::Stream{.stream = std::move(out), .finalizer = std::move(finalizer)}); + temp_part.streams.emplace_back(MergeTreeDataWriter::TemporaryPart::Stream{.stream = std::move(out), .finalizer = std::move(finalizer)}); } std::vector SparkMergeTreeWriter::getAllPartInfo() @@ -378,12 +448,11 @@ std::vector SparkMergeTreeWriter::getAllPartInfo() std::vector res; res.reserve(new_parts.size()); - for (auto part : new_parts.unsafeGet()) + for (const auto & part : new_parts.unsafeGet()) { res.emplace_back( PartInfo{part->name, part->getMarksCount(), part->getBytesOnDisk(), part->rows_count, partition_values, bucket_dir}); } - return res; } @@ -425,37 +494,42 @@ void SparkMergeTreeWriter::checkAndMerge(bool force) if (!force && new_parts.size() < merge_limit_parts) return; - auto doTask = [this]( - const ThreadGroupPtr & thread_group, - const std::vector prepare_merge_parts, - CustomStorageMergeTreePtr & storage, - String & partition_dir, - String & bucket_dir) -> std::vector + auto doMergeTask = [this](const std::vector & prepare_merge_parts) { - setThreadName("InsertWithMerge"); - ThreadStatus thread_status; - thread_status.attachToGroup(thread_group); - - size_t before_size = 0; - size_t after_size = 0; - for (const auto & prepare_merge_part : prepare_merge_parts) - before_size += prepare_merge_part->getBytesOnDisk(); - - std::unordered_map partition_values; - auto merged_parts - = mergeParts(prepare_merge_parts, partition_values, toString(UUIDHelpers::generateV4()), storage, partition_dir, bucket_dir); - for (const auto & merge_tree_data_part : merged_parts) - after_size += merge_tree_data_part->getBytesOnDisk(); - - LOG_DEBUG( - &Poco::Logger::get("SparkMergeTreeWriter"), - "Mergetree merge on insert finished, before merge part size {}, part count {}, after part size {}, part count {}.", - before_size, - prepare_merge_parts.size(), - after_size, - merged_parts.size()); + for (const auto & selected_part : prepare_merge_parts) + tmp_parts.emplace(selected_part->name); - return merged_parts; + thread_pool.scheduleOrThrow( + [this, prepare_merge_parts, thread_group = CurrentThread::getGroup()]() -> void + { + Stopwatch watch; + setThreadName("InsertWithMerge"); + ThreadStatus thread_status; + thread_status.attachToGroup(thread_group); + + size_t before_size = 0; + size_t after_size = 0; + for (const auto & prepare_merge_part : prepare_merge_parts) + 
before_size += prepare_merge_part->getBytesOnDisk(); + + std::unordered_map partition_values; + const auto merged_parts = mergeParts( + prepare_merge_parts, partition_values, toString(UUIDHelpers::generateV4()), storage, partition_dir, bucket_dir); + for (const auto & merge_tree_data_part : merged_parts) + after_size += merge_tree_data_part->getBytesOnDisk(); + + new_parts.emplace_back(merged_parts); + watch.stop(); + LOG_INFO( + &Poco::Logger::get("SparkMergeTreeWriter"), + "Merge success. Before merge part size {}, part count {}, after part size {}, part count {}, " + "total elapsed {} ms", + before_size, + prepare_merge_parts.size(), + after_size, + merged_parts.size(), + watch.elapsedMilliseconds()); + }); }; std::vector selected_parts; @@ -477,13 +551,7 @@ void SparkMergeTreeWriter::checkAndMerge(bool force) if (merge_min_size > totol_size && merge_limit_parts > selected_parts.size()) continue; - for (auto selected_part : selected_parts) - { - tmp_parts.emplace(selected_part->name); - } - - thread_pool.scheduleOrThrow([this, doTask, selected_parts, thread_group = CurrentThread::getGroup()]() -> void - { new_parts.emplace_back(doTask(thread_group, selected_parts, storage, partition_dir, bucket_dir)); }); + doMergeTask(selected_parts); selected_parts.clear(); totol_size = 0; } @@ -491,13 +559,7 @@ void SparkMergeTreeWriter::checkAndMerge(bool force) if (!selected_parts.empty()) { if (force && selected_parts.size() > 1) - { - for (auto selected_part : selected_parts) - tmp_parts.emplace(selected_part->name); - thread_pool.scheduleOrThrow( - [this, doTask, selected_parts, thread_group = CurrentThread::getGroup()]() -> void - { new_parts.emplace_back(doTask(thread_group, selected_parts, storage, partition_dir, bucket_dir)); }); - } + doMergeTask(selected_parts); else new_parts.emplace_back(selected_parts); } diff --git a/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.h b/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.h index 5251d4cc447a3..13ac223944775 100644 --- a/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.h +++ b/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.h @@ -15,13 +15,15 @@ * limitations under the License. 
*/ #pragma once + #include -#include -#include +#include #include +#include #include #include #include +#include namespace DB { @@ -51,44 +53,57 @@ class SparkMergeTreeWriter public: static String partInfosToJson(const std::vector & part_infos); SparkMergeTreeWriter( - CustomStorageMergeTreePtr storage_, - const DB::StorageMetadataPtr & metadata_snapshot_, + const MergeTreeTable & merge_tree_table, const DB::ContextPtr & context_, - const String & uuid_, + const String & part_name_prefix_, const String & partition_dir_ = "", const String & bucket_dir_ = ""); - void write(DB::Block & block); + void write(const DB::Block & block); void finalize(); std::vector getAllPartInfo(); private: - void - writeTempPart(MergeTreeDataWriter::TemporaryPart & temp_part, DB::BlockWithPartition & block_with_partition, const DB::StorageMetadataPtr & metadata_snapshot); + void writeTempPart( + MergeTreeDataWriter::TemporaryPart & temp_part, + DB::BlockWithPartition & block_with_partition, + const DB::StorageMetadataPtr & metadata_snapshot); DB::MergeTreeDataWriter::TemporaryPart writeTempPartAndFinalize(DB::BlockWithPartition & block_with_partition, const DB::StorageMetadataPtr & metadata_snapshot); void checkAndMerge(bool force = false); void safeEmplaceBackPart(DB::MergeTreeDataPartPtr); void safeAddPart(DB::MergeTreeDataPartPtr); void manualFreeMemory(size_t before_write_memory); + void saveMetadata(); + void commitPartToRemoteStorageIfNeeded(); + void finalizeMerge(); + bool chunkToPart(Chunk && chunk); + bool blockToPart(Block & block); + bool useLocalStorage() const; - String uuid; + CustomStorageMergeTreePtr storage = nullptr; + CustomStorageMergeTreePtr dest_storage = nullptr; + CustomStorageMergeTreePtr temp_storage = nullptr; + DB::StorageMetadataPtr metadata_snapshot = nullptr; + + String part_name_prefix; String partition_dir; String bucket_dir; - CustomStorageMergeTreePtr storage; - DB::StorageMetadataPtr metadata_snapshot; + DB::ContextPtr context; - std::unique_ptr squashing_transform; + std::unique_ptr squashing; int part_num = 1; ConcurrentDeque new_parts; std::unordered_map partition_values; std::unordered_set tmp_parts; DB::Block header; bool merge_after_insert; + bool insert_without_local_storage; FreeThreadPool thread_pool; size_t merge_min_size = 1024 * 1024 * 1024; size_t merge_limit_parts = 10; std::mutex memory_mutex; + bool isRemoteStorage = false; }; } diff --git a/cpp-ch/local-engine/Storages/Parquet/CMakeLists.txt b/cpp-ch/local-engine/Storages/Parquet/CMakeLists.txt index f3d9e14f4b9c9..bfe5387108044 100644 --- a/cpp-ch/local-engine/Storages/Parquet/CMakeLists.txt +++ b/cpp-ch/local-engine/Storages/Parquet/CMakeLists.txt @@ -18,13 +18,10 @@ set(ARROW_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src") add_headers_and_sources(Parquet .) 
add_library(ch_parquet ${Parquet_sources}) -target_link_libraries(ch_parquet PUBLIC - boost::headers_only - clickhouse_common_io -) +target_link_libraries(ch_parquet PUBLIC boost::headers_only + clickhouse_common_io) -target_include_directories(ch_parquet SYSTEM BEFORE PUBLIC - ${ARROW_INCLUDE_DIR} - ${CMAKE_BINARY_DIR}/contrib/arrow-cmake/cpp/src - ${ClickHouse_SOURCE_DIR}/contrib/arrow-cmake/cpp/src -) +target_include_directories( + ch_parquet SYSTEM BEFORE + PUBLIC ${ARROW_INCLUDE_DIR} ${CMAKE_BINARY_DIR}/contrib/arrow-cmake/cpp/src + ${ClickHouse_SOURCE_DIR}/contrib/arrow-cmake/cpp/src) diff --git a/cpp-ch/local-engine/Storages/Parquet/ColumnIndexFilter.cpp b/cpp-ch/local-engine/Storages/Parquet/ColumnIndexFilter.cpp index 0d3b07e4786a3..817de7f27ef85 100644 --- a/cpp-ch/local-engine/Storages/Parquet/ColumnIndexFilter.cpp +++ b/cpp-ch/local-engine/Storages/Parquet/ColumnIndexFilter.cpp @@ -547,7 +547,8 @@ PageIndexs TypedColumnIndexImpl::notEq(const DB::Field & value) co } // Merging value filtering with pages containing nulls - auto real_value{parquetCast(value)}; + ToParquet to_parquet; + auto real_value{to_parquet.as(value, *descr_)}; TypedComparator typed_comparator{real_value, *column_index_, *comparator_}; auto pages = ORDER::notEq(typed_comparator); const std::set matchingIndexes(pages.begin(), pages.end()); @@ -573,7 +574,8 @@ PageIndexs TypedColumnIndexImpl::eq(const DB::Field & value) const return {PageIndexsBuilder::ALL_PAGES}; } } - auto real_value = parquetCast(value); + ToParquet to_parquet; + auto real_value{to_parquet.as(value, *descr_)}; TypedComparator typed_comparator{real_value, *column_index_, *comparator_}; return ORDER::eq(typed_comparator); } @@ -581,7 +583,8 @@ PageIndexs TypedColumnIndexImpl::eq(const DB::Field & value) const template ORDER> PageIndexs TypedColumnIndexImpl::gt(const DB::Field & value) const { - auto real_value{parquetCast(value)}; + ToParquet to_parquet; + auto real_value{to_parquet.as(value, *descr_)}; TypedComparator typed_comparator{real_value, *column_index_, *comparator_}; return ORDER::gt(typed_comparator); } @@ -589,7 +592,8 @@ PageIndexs TypedColumnIndexImpl::gt(const DB::Field & value) const template ORDER> PageIndexs TypedColumnIndexImpl::gtEg(const DB::Field & value) const { - auto real_value{parquetCast(value)}; + ToParquet to_parquet; + auto real_value{to_parquet.as(value, *descr_)}; TypedComparator typed_comparator{real_value, *column_index_, *comparator_}; return ORDER::gtEq(typed_comparator); } @@ -597,7 +601,8 @@ PageIndexs TypedColumnIndexImpl::gtEg(const DB::Field & value) con template ORDER> PageIndexs TypedColumnIndexImpl::lt(const DB::Field & value) const { - auto real_value{parquetCast(value)}; + ToParquet to_parquet; + auto real_value{to_parquet.as(value, *descr_)}; TypedComparator typed_comparator{real_value, *column_index_, *comparator_}; return ORDER::lt(typed_comparator); } @@ -605,7 +610,8 @@ PageIndexs TypedColumnIndexImpl::lt(const DB::Field & value) const template ORDER> PageIndexs TypedColumnIndexImpl::ltEg(const DB::Field & value) const { - auto real_value{parquetCast(value)}; + ToParquet to_parquet; + auto real_value{to_parquet.as(value, *descr_)}; TypedComparator typed_comparator{real_value, *column_index_, *comparator_}; return ORDER::ltEq(typed_comparator); } @@ -615,7 +621,7 @@ PageIndexs TypedColumnIndexImpl::in(const DB::ColumnPtr & column) { /// TDDO: handle null /// - std::shared_ptr> converter = ParquetConverter::Make(column); + std::shared_ptr> converter = ParquetConverter::Make(column, *descr_); 
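The ParquetConverter.h changes that follow replace parquetCast with ToParquet specializations, including a FIXED_LEN_BYTE_ARRAY path that re-encodes a Decimal128 value as big-endian bytes truncated to the column's type_length. The sketch below is a standalone illustration of that byte-order handling only; decimalToBigEndianFLBA is a hypothetical helper written in plain C++ (using the clang/gcc __int128 extension) and is not part of this patch.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical helper mirroring the byte handling in the FIXED_LEN_BYTE_ARRAY
// specialization below: the 128-bit value is byte-reversed into big-endian
// order and only the last type_length bytes are kept, matching a decimal
// column narrower than 16 bytes.
static void decimalToBigEndianFLBA(__int128 value, size_t type_length, uint8_t * out)
{
    uint8_t buf[sizeof(__int128)];
    std::memcpy(buf, &value, sizeof(buf));       // native (little-endian) layout
    std::reverse(buf, buf + sizeof(buf));        // now big-endian
    const size_t offset = sizeof(buf) - type_length;
    std::memcpy(out, buf + offset, type_length); // keep the low type_length bytes
}

int main()
{
    uint8_t flba[6] = {};
    decimalToBigEndianFLBA(static_cast<__int128>(123456789), sizeof(flba), flba); // 0x075BCD15
    for (unsigned char b : flba)
        std::printf("%02x ", b);                 // prints: 00 00 07 5b cd 15
    std::printf("\n");
    return 0;
}

The ConverterDecimal batch converter introduced in the same header appears to apply the same per-value reversal and offset when preparing FixedLenByteArray pointers for the page-index comparators.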
const auto * value = converter->getBatch(0, column->size()); T min, max; std::tie(min, max) = comparator_->GetMinMax(value, column->size()); @@ -659,7 +665,8 @@ ColumnIndexPtr internalMakeColumnIndex( switch (physical_type) { case parquet::Type::BOOLEAN: - break; + return std::make_unique>( + descr, dynamic_pointer_cast(column_index), offset_index); case parquet::Type::INT32: return std::make_unique>( descr, dynamic_pointer_cast(column_index), offset_index); @@ -669,20 +676,21 @@ ColumnIndexPtr internalMakeColumnIndex( case parquet::Type::INT96: break; case parquet::Type::FLOAT: - break; + return std::make_unique>( + descr, dynamic_pointer_cast(column_index), offset_index); case parquet::Type::DOUBLE: return std::make_unique>( descr, dynamic_pointer_cast(column_index), offset_index); - break; case parquet::Type::BYTE_ARRAY: return std::make_unique>( descr, dynamic_pointer_cast(column_index), offset_index); case parquet::Type::FIXED_LEN_BYTE_ARRAY: - break; + return std::make_unique>( + descr, dynamic_pointer_cast(column_index), offset_index); case parquet::Type::UNDEFINED: break; } - throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Unsupported physical type {}", physical_type); + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Unsupported physical type {}", TypeToString(physical_type)); } ColumnIndexPtr ColumnIndex::create( diff --git a/cpp-ch/local-engine/Storages/Parquet/ParquetConverter.h b/cpp-ch/local-engine/Storages/Parquet/ParquetConverter.h index ac7b2479abe22..89e83e668aebc 100644 --- a/cpp-ch/local-engine/Storages/Parquet/ParquetConverter.h +++ b/cpp-ch/local-engine/Storages/Parquet/ParquetConverter.h @@ -15,33 +15,72 @@ * limitations under the License. */ #pragma once +#include #include +#include +#include #include #include #include +namespace DB::ErrorCodes +{ +extern const int LOGICAL_ERROR; +} + namespace local_engine { + template -auto parquetCast(const DB::Field & value) -> typename PhysicalType::c_type +struct ToParquet { using T = typename PhysicalType::c_type; - if constexpr (std::is_same_v) - return static_cast(value.get()); - else if constexpr (std::is_same_v) + T as(const DB::Field & value, const parquet::ColumnDescriptor &) + { + if constexpr (std::is_same_v) + return static_cast(value.get()); + // parquet::BooleanType, parquet::Int64Type, parquet::FloatType, parquet::DoubleType + return value.get(); // FLOAT, DOUBLE, INT64 + } +}; + +template <> +struct ToParquet +{ + using T = parquet::ByteArray; + T as(const DB::Field & value, const parquet::ColumnDescriptor &) { assert(value.getType() == DB::Field::Types::String); const std::string & s = value.get(); const auto * const ptr = reinterpret_cast(s.data()); return parquet::ByteArray(static_cast(s.size()), ptr); } - else if constexpr (std::is_same_v) +}; + +template <> +struct ToParquet +{ + uint8_t buf[256]; + using T = parquet::FixedLenByteArray; + T as(const DB::Field & value, const parquet::ColumnDescriptor & descriptor) { - abort(); + if (value.getType() != DB::Field::Types::Decimal128) + throw DB::Exception( + DB::ErrorCodes::LOGICAL_ERROR, "Field type '{}' for FIXED_LEN_BYTE_ARRAY is not supported", value.getTypeName()); + static_assert(sizeof(Int128) <= sizeof(buf)); + if (descriptor.type_length() > sizeof(Int128)) + throw DB::Exception( + DB::ErrorCodes::LOGICAL_ERROR, + "descriptor.type_length() = {} , which is > {}, e.g. 
sizeof(Int128)", + descriptor.type_length(), + sizeof(Int128)); + Int128 val = value.get>().getValue(); + std::reverse(reinterpret_cast(&val), reinterpret_cast(&val) + sizeof(val)); + const int offset = sizeof(Int128) - descriptor.type_length(); + memcpy(buf, reinterpret_cast(&val) + offset, descriptor.type_length()); + return parquet::FixedLenByteArray(buf); } - else - return value.get(); // FLOAT, DOUBLE, INT64 -} +}; // Int32 Int64 Float Double template @@ -100,6 +139,42 @@ struct ConverterString } }; +/// Like ConverterNumberAsFixedString, but converts to big-endian. Because that's the byte order +/// Parquet uses for decimal types and literally nothing else, for some reason. +template +struct ConverterDecimal +{ + const parquet::ColumnDescriptor & descriptor; + const DB::ColumnDecimal & column; + DB::PODArray data_buf; + DB::PODArray ptr_buf; + + explicit ConverterDecimal(const DB::ColumnPtr & c, const parquet::ColumnDescriptor & desc) + : descriptor(desc), column(assert_cast &>(*c)) + { + if (descriptor.type_length() > sizeof(T)) + throw DB::Exception( + DB::ErrorCodes::LOGICAL_ERROR, + "descriptor.type_length() = {} , which is > {}, e.g. sizeof(T)", + descriptor.type_length(), + sizeof(T)); + } + + const parquet::FixedLenByteArray * getBatch(size_t offset, size_t count) + { + data_buf.resize(count * sizeof(T)); + ptr_buf.resize(count); + memcpy(data_buf.data(), reinterpret_cast(column.getData().data() + offset), count * sizeof(T)); + const size_t offset_in_buf = sizeof(Int128) - descriptor.type_length(); + ; + for (size_t i = 0; i < count; ++i) + { + std::reverse(data_buf.data() + i * sizeof(T), data_buf.data() + (i + 1) * sizeof(T)); + ptr_buf[i].ptr = data_buf.data() + i * sizeof(T) + offset_in_buf; + } + return ptr_buf.data(); + } +}; class BaseConverter { @@ -115,7 +190,7 @@ class ParquetConverter : public BaseConverter public: virtual const T * getBatch(size_t offset, size_t count) = 0; - static std::shared_ptr> Make(const DB::ColumnPtr & c); + static std::shared_ptr> Make(const DB::ColumnPtr & c, const parquet::ColumnDescriptor & desc); }; template @@ -134,7 +209,7 @@ class ParquetConverterImpl final : public ParquetConverter template -std::shared_ptr> ParquetConverter::Make(const DB::ColumnPtr & c) +std::shared_ptr> ParquetConverter::Make(const DB::ColumnPtr & c, const parquet::ColumnDescriptor & desc) { std::shared_ptr result; @@ -204,6 +279,17 @@ std::shared_ptr> ParquetConverter::Make(const DB: break; } break; + case parquet::Type::FIXED_LEN_BYTE_ARRAY: + switch (c->getDataType()) + { + case TypeIndex::Decimal128: + result = std::make_shared>>( + ConverterDecimal(c, desc)); + break; + default: + break; + } + break; default: break; } diff --git a/cpp-ch/local-engine/Storages/SourceFromJavaIter.cpp b/cpp-ch/local-engine/Storages/SourceFromJavaIter.cpp index 54d1d253e5395..37501e98504a0 100644 --- a/cpp-ch/local-engine/Storages/SourceFromJavaIter.cpp +++ b/cpp-ch/local-engine/Storages/SourceFromJavaIter.cpp @@ -53,12 +53,11 @@ static DB::Block getRealHeader(const DB::Block & header) DB::Block * SourceFromJavaIter::peekBlock(JNIEnv * env, jobject java_iter) { jboolean has_next = safeCallBooleanMethod(env, java_iter, serialized_record_batch_iterator_hasNext); - if (has_next) - { - jbyteArray block = static_cast(safeCallObjectMethod(env, java_iter, serialized_record_batch_iterator_next)); - return reinterpret_cast(byteArrayToLong(env, block)); - } - return nullptr; + if (!has_next) + return nullptr; + + jbyteArray block = static_cast(safeCallObjectMethod(env, java_iter, 
serialized_record_batch_iterator_next)); + return reinterpret_cast(byteArrayToLong(env, block)); } @@ -75,6 +74,9 @@ SourceFromJavaIter::SourceFromJavaIter( DB::Chunk SourceFromJavaIter::generate() { + if (isCancelled()) + return {}; + GET_JNIENV(env) SCOPE_EXIT({CLEAN_JNIENV}); @@ -152,6 +154,7 @@ void SourceFromJavaIter::convertNullable(DB::Chunk & chunk) chunk.setColumns(columns, rows); } + DB::ColumnPtr SourceFromJavaIter::convertNestedNullable(const DB::ColumnPtr & column, const DB::DataTypePtr & target_type) { DB::WhichDataType column_type(column->getDataType()); diff --git a/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.cpp b/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.cpp index bb8d9d3d71403..0731ac92cd078 100644 --- a/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.cpp +++ b/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.cpp @@ -18,71 +18,56 @@ namespace local_engine { + +String StorageMergeTreeFactory::getTableName(const StorageID & id, const String & snapshot_id) +{ + auto table_name = id.database_name + "." + id.table_name; + // for optimize table + if (!snapshot_id.empty()) + table_name += "_" + snapshot_id; + + return table_name; +} + + StorageMergeTreeFactory & StorageMergeTreeFactory::instance() { static StorageMergeTreeFactory ret; return ret; } -void StorageMergeTreeFactory::freeStorage(StorageID id) +void StorageMergeTreeFactory::freeStorage(const StorageID & id, const String & snapshot_id) { - if (!id.hasUUID()) - { - return; - } - auto table_name = id.database_name + "." + id.table_name + "@" + toString(id.uuid); + auto table_name = getTableName(id, snapshot_id); { std::lock_guard lock(storage_map_mutex); if (storage_map->has(table_name)) - { storage_map->remove(table_name); - } } { std::lock_guard lock(datapart_mutex); if (datapart_map->has(table_name)) - { datapart_map->remove(table_name); - } } } CustomStorageMergeTreePtr -StorageMergeTreeFactory::getStorage(StorageID id, const String & snapshot_id, ColumnsDescription columns, std::function creator) +StorageMergeTreeFactory::getStorage(StorageID id, const String & snapshot_id, std::function creator) { - auto table_name = id.database_name + "." + id.table_name; - // for optimize table - if (id.hasUUID()) - { - table_name += "@" + toString(id.uuid); - } - else - { - table_name += "_" + snapshot_id; - } + auto table_name = getTableName(id, snapshot_id); std::lock_guard lock(storage_map_mutex); if (!storage_map->has(table_name)) - { storage_map->add(table_name, creator()); - } return *(storage_map->get(table_name)); } -DataPartsVector StorageMergeTreeFactory::getDataParts(StorageID id, const String & snapshot_id, std::unordered_set part_name) +DataPartsVector StorageMergeTreeFactory::getDataPartsByNames(const StorageID & id, const String & snapshot_id, std::unordered_set part_name) { DataPartsVector res; - auto table_name = id.database_name + "." 
+ id.table_name; - // for optimize table - if (id.hasUUID()) - { - table_name += "@" + toString(id.uuid); - } - else - { - table_name += "_" + snapshot_id; - } + auto table_name = getTableName(id, snapshot_id); + std::lock_guard lock(datapart_mutex); std::unordered_set missing_names; if (!datapart_map->has(table_name)) [[unlikely]] diff --git a/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.h b/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.h index 82dae37452115..d7bcb93c07d7d 100644 --- a/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.h +++ b/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.h @@ -28,10 +28,10 @@ class StorageMergeTreeFactory { public: static StorageMergeTreeFactory & instance(); - static void freeStorage(StorageID id); + static void freeStorage(const StorageID & id, const String & snapshot_id = ""); static CustomStorageMergeTreePtr - getStorage(StorageID id, const String & snapshot_id, ColumnsDescription columns, std::function creator); - static DataPartsVector getDataParts(StorageID id, const String & snapshot_id, std::unordered_set part_name); + getStorage(StorageID id, const String & snapshot_id, std::function creator); + static DataPartsVector getDataPartsByNames(const StorageID & id, const String & snapshot_id, std::unordered_set part_name); static void init_cache_map() { auto & storage_map_v = storage_map; @@ -61,6 +61,8 @@ class StorageMergeTreeFactory if (datapart_map) datapart_map->clear(); } + static String getTableName(const StorageID & id, const String & snapshot_id); + private: static std::unique_ptr> storage_map; static std::unique_ptr>>> datapart_map; diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/CMakeLists.txt b/cpp-ch/local-engine/Storages/SubstraitSource/CMakeLists.txt index 4e43c924f6fef..228f54255cb43 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/CMakeLists.txt +++ b/cpp-ch/local-engine/Storages/SubstraitSource/CMakeLists.txt @@ -15,30 +15,24 @@ set(ARROW_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src") - macro(add_headers_and_sources_including_cc prefix common_path) - add_glob(${prefix}_headers ${CMAKE_CURRENT_SOURCE_DIR} ${common_path}/*.h) - add_glob(${prefix}_sources ${common_path}/*.cpp ${common_path}/*.c ${common_path}/*.cc ${common_path}/*.h) + add_glob(${prefix}_headers ${CMAKE_CURRENT_SOURCE_DIR} ${common_path}/*.h) + add_glob(${prefix}_sources ${common_path}/*.cpp ${common_path}/*.c + ${common_path}/*.cc ${common_path}/*.h) endmacro() add_headers_and_sources(substrait_source .) 
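Stepping back to the StorageMergeTreeFactory changes above: both caches are now keyed by getTableName(id, snapshot_id) ("db.table" plus an optional "_snapshot" suffix) instead of the storage UUID, and getStorage runs the creator only on a cache miss while holding the lock. The sketch below is a generic illustration of that create-on-miss pattern; NamedCache is an invented name and not the actual factory.

#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <string>

// Invented illustration of the getStorage()-style lookup: key by a composite
// table name and run the creator only when the entry is missing.
template <typename T>
class NamedCache
{
public:
    std::shared_ptr<T> getOrCreate(const std::string & name, const std::function<std::shared_ptr<T>()> & creator)
    {
        std::lock_guard<std::mutex> lock(mutex);
        auto it = map.find(name);
        if (it == map.end())
            it = map.emplace(name, creator()).first;
        return it->second;
    }

private:
    std::mutex mutex;
    std::map<std::string, std::shared_ptr<T>> map;
};

int main()
{
    NamedCache<std::string> cache;
    auto a = cache.getOrCreate("db.tbl_snap1", [] { return std::make_shared<std::string>("created"); });
    auto b = cache.getOrCreate("db.tbl_snap1", [] { return std::make_shared<std::string>("not used"); });
    std::cout << (a == b ? "same instance\n" : "different\n"); // prints: same instance
    return 0;
}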
add_headers_and_sources_including_cc(ch_parquet arrow) add_library(substrait_source ${substrait_source_sources}) -target_compile_options(substrait_source PRIVATE - -Wno-suggest-destructor-override - -Wno-inconsistent-missing-destructor-override -) +target_compile_options( + substrait_source PRIVATE -Wno-suggest-destructor-override + -Wno-inconsistent-missing-destructor-override) -target_link_libraries(substrait_source PUBLIC - boost::headers_only - ch_contrib::protobuf - clickhouse_common_io - ch_contrib::hdfs - substrait -) +target_link_libraries( + substrait_source PUBLIC boost::headers_only ch_contrib::protobuf + clickhouse_common_io ch_contrib::hdfs substrait) -target_include_directories(substrait_source SYSTEM BEFORE PUBLIC - ${ARROW_INCLUDE_DIR} - ${CMAKE_BINARY_DIR}/contrib/arrow-cmake/cpp/src - ${ClickHouse_SOURCE_DIR}/contrib/arrow-cmake/cpp/src -) \ No newline at end of file +target_include_directories( + substrait_source SYSTEM BEFORE + PUBLIC ${ARROW_INCLUDE_DIR} ${CMAKE_BINARY_DIR}/contrib/arrow-cmake/cpp/src + ${ClickHouse_SOURCE_DIR}/contrib/arrow-cmake/cpp/src) diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/ExcelTextFormatFile.cpp b/cpp-ch/local-engine/Storages/SubstraitSource/ExcelTextFormatFile.cpp index 038f280b05600..31ef5b9e17159 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/ExcelTextFormatFile.cpp +++ b/cpp-ch/local-engine/Storages/SubstraitSource/ExcelTextFormatFile.cpp @@ -293,7 +293,12 @@ bool ExcelTextFormatReader::readField( return false; } - if (column_size == column.size()) + // See https://github.com/ClickHouse/ClickHouse/pull/60556 + // If parsing fails, an element is always pushed into the null map, + // so we need to use nestedColumn to check whether an error occurred. + /// FIXME: move it to ExcelSerialization ???
+ const auto nestedColumn = DB::removeNullable(column.getPtr()); + if (column_size == nestedColumn->size()) { skipErrorChars(*buf, has_quote, maybe_quote, escape, format_settings); column_back_func(column); diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/ParquetFormatFile.cpp b/cpp-ch/local-engine/Storages/SubstraitSource/ParquetFormatFile.cpp index 2e0f000456d82..f557df5b27bf1 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/ParquetFormatFile.cpp +++ b/cpp-ch/local-engine/Storages/SubstraitSource/ParquetFormatFile.cpp @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include #include @@ -46,12 +46,13 @@ extern const int UNKNOWN_TYPE; namespace local_engine { + ParquetFormatFile::ParquetFormatFile( const DB::ContextPtr & context_, const substrait::ReadRel::LocalFiles::FileOrFiles & file_info_, const ReadBufferBuilderPtr & read_buffer_builder_, bool use_local_format_) - : FormatFile(context_, file_info_, read_buffer_builder_), use_local_format(use_local_format_) + : FormatFile(context_, file_info_, read_buffer_builder_), use_pageindex_reader(use_local_format_) { } @@ -85,7 +86,7 @@ FormatFile::InputFormatPtr ParquetFormatFile::createInputFormat(const DB::Block std::ranges::set_difference(total_row_group_indices, required_row_group_indices, std::back_inserter(skip_row_group_indices)); format_settings.parquet.skip_row_groups = std::unordered_set(skip_row_group_indices.begin(), skip_row_group_indices.end()); - if (use_local_format) + if (use_pageindex_reader && pageindex_reader_support(header)) res->input = std::make_shared(*(res->read_buffer), header, format_settings); else res->input = std::make_shared(*(res->read_buffer), header, format_settings, 1, 8192); @@ -112,6 +113,19 @@ std::optional ParquetFormatFile::getTotalRows() return total_rows; } } +bool ParquetFormatFile::pageindex_reader_support(const DB::Block & header) +{ + const auto result = std::ranges::find_if( + header, + [](DB::ColumnWithTypeAndName const & col) + { + const DB::DataTypePtr type_not_nullable = DB::removeNullable(col.type); + const DB::WhichDataType which(type_not_nullable); + return DB::isArray(which) || DB::isMap(which) || DB::isTuple(which); + }); + + return result == header.end(); +} std::vector ParquetFormatFile::collectRequiredRowGroups(int & total_row_groups) const { diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/ParquetFormatFile.h b/cpp-ch/local-engine/Storages/SubstraitSource/ParquetFormatFile.h index 045f0049d6742..ba7f28883e656 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/ParquetFormatFile.h +++ b/cpp-ch/local-engine/Storages/SubstraitSource/ParquetFormatFile.h @@ -55,8 +55,10 @@ class ParquetFormatFile : public FormatFile String getFileFormat() const override { return "Parquet"; } + static bool pageindex_reader_support(const DB::Block & header); + private: - bool use_local_format; + bool use_pageindex_reader; std::mutex mutex; std::optional total_rows; diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/ReadBufferBuilder.cpp b/cpp-ch/local-engine/Storages/SubstraitSource/ReadBufferBuilder.cpp index d54ff985ec5f0..ec967a869600f 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/ReadBufferBuilder.cpp +++ b/cpp-ch/local-engine/Storages/SubstraitSource/ReadBufferBuilder.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +39,6 @@ #include #include #include -#include #include #include #include @@ -437,7 +437,7 @@ class S3FileReadBufferBuilder : public ReadBufferBuilder bucket, object.remote_path, 
"", - DB::S3Settings::RequestSettings(), + DB::S3::RequestSettings(), new_settings, /* use_external_buffer */ true, /* offset */ 0, diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.cpp b/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.cpp index 80dccf7590601..5b872244eab5c 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.cpp +++ b/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.cpp @@ -104,6 +104,9 @@ DB::Chunk SubstraitFileSource::generate() bool SubstraitFileSource::tryPrepareReader() { + if (isCancelled()) + return false; + if (file_reader) return true; @@ -140,6 +143,13 @@ bool SubstraitFileSource::tryPrepareReader() return true; } + +void SubstraitFileSource::onCancel() +{ + if (file_reader) + file_reader->cancel(); +} + DB::ColumnPtr FileReaderWrapper::createConstColumn(DB::DataTypePtr data_type, const DB::Field & field, size_t rows) { auto nested_type = DB::removeNullable(data_type); @@ -280,9 +290,13 @@ ConstColumnsFileReader::ConstColumnsFileReader(FormatFilePtr file_, DB::ContextP remained_rows = *rows; } + bool ConstColumnsFileReader::pull(DB::Chunk & chunk) { - if (!remained_rows) [[unlikely]] + if (isCancelled()) + return false; + + if (!remained_rows) return false; size_t to_read_rows = 0; @@ -296,6 +310,7 @@ bool ConstColumnsFileReader::pull(DB::Chunk & chunk) to_read_rows = block_size; remained_rows -= block_size; } + DB::Columns res_columns; if (const size_t col_num = header.columns()) { @@ -307,8 +322,9 @@ bool ConstColumnsFileReader::pull(DB::Chunk & chunk) auto type = col_with_name_and_type.type; const auto & name = col_with_name_and_type.name; auto it = partition_values.find(name); - if (it == partition_values.end()) [[unlikely]] + if (it == partition_values.end()) throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Unknow partition column : {}", name); + res_columns.emplace_back(createColumn(it->second, type, to_read_rows)); } } @@ -331,6 +347,9 @@ NormalFileReader::NormalFileReader( bool NormalFileReader::pull(DB::Chunk & chunk) { + if (isCancelled()) + return false; + DB::Chunk raw_chunk = input_format->input->generate(); const size_t rows = raw_chunk.getNumRows(); if (!rows) diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.h b/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.h index 973f3cd35b2a7..650ec5d967a0f 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.h +++ b/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.h @@ -34,6 +34,15 @@ class FileReaderWrapper virtual ~FileReaderWrapper() = default; virtual bool pull(DB::Chunk & chunk) = 0; + void cancel() + { + bool already_cancelled = is_cancelled.exchange(true, std::memory_order_acq_rel); + if (!already_cancelled) + onCancel(); + } + + bool isCancelled() const { return is_cancelled.load(std::memory_order_acquire); } + /// Apply key condition to the reader, if use_local_format is true, column_index_filter will be used /// otherwise it will be ignored virtual void applyKeyCondition( @@ -42,7 +51,11 @@ class FileReaderWrapper } protected: + virtual void onCancel() {}; + FormatFilePtr file; + std::atomic is_cancelled{false}; + static DB::ColumnPtr createConstColumn(DB::DataTypePtr type, const DB::Field & field, size_t rows); static DB::ColumnPtr createColumn(const String & value, DB::DataTypePtr type, size_t rows); @@ -68,10 +81,14 @@ class NormalFileReader : public FileReaderWrapper } private: + void onCancel() override + { + 
input_format->input->cancel(); + } + DB::ContextPtr context; DB::Block to_read_header; DB::Block output_header; - FormatFile::InputFormatPtr input_format; }; @@ -89,6 +106,7 @@ class ConstColumnsFileReader : public FileReaderWrapper ConstColumnsFileReader( FormatFilePtr file_, DB::ContextPtr context_, const DB::Block & header_, size_t block_size_ = DB::DEFAULT_BLOCK_SIZE); ~ConstColumnsFileReader() override = default; + bool pull(DB::Chunk & chunk) override; private: @@ -112,6 +130,9 @@ class SubstraitFileSource : public DB::SourceWithKeyCondition DB::Chunk generate() override; private: + bool tryPrepareReader(); + void onCancel() override; + DB::ContextPtr context; DB::Block output_header; /// Sample header may contains partitions keys DB::Block to_read_header; // Sample header not include partition keys @@ -120,9 +141,6 @@ class SubstraitFileSource : public DB::SourceWithKeyCondition UInt32 current_file_index = 0; std::unique_ptr file_reader; ReadBufferBuilderPtr read_buffer_builder; - ColumnIndexFilterPtr column_index_filter; - - bool tryPrepareReader(); }; } diff --git a/cpp-ch/local-engine/examples/CMakeLists.txt b/cpp-ch/local-engine/examples/CMakeLists.txt index bbeeb98d24452..03cd3bfe3f193 100644 --- a/cpp-ch/local-engine/examples/CMakeLists.txt +++ b/cpp-ch/local-engine/examples/CMakeLists.txt @@ -13,5 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -clickhouse_add_executable (signal_demo signal_demo.cpp) -target_link_libraries(signal_demo PRIVATE gluten_clickhouse_backend_libs loggers) \ No newline at end of file +clickhouse_add_executable(signal_demo signal_demo.cpp) +target_link_libraries(signal_demo PRIVATE gluten_clickhouse_backend_libs + loggers) diff --git a/cpp-ch/local-engine/jni/ReservationListenerWrapper.cpp b/cpp-ch/local-engine/jni/ReservationListenerWrapper.cpp index 65b29c2a2d1c7..dad0ecf66ed76 100644 --- a/cpp-ch/local-engine/jni/ReservationListenerWrapper.cpp +++ b/cpp-ch/local-engine/jni/ReservationListenerWrapper.cpp @@ -58,6 +58,13 @@ void ReservationListenerWrapper::free(int64_t size) CLEAN_JNIENV } +void ReservationListenerWrapper::tryFree(int64_t size) +{ + GET_JNIENV(env) + tryCallVoidMethod(env, listener, reservation_listener_unreserve, size); + CLEAN_JNIENV +} + size_t ReservationListenerWrapper::currentMemory() { GET_JNIENV(env) diff --git a/cpp-ch/local-engine/jni/ReservationListenerWrapper.h b/cpp-ch/local-engine/jni/ReservationListenerWrapper.h index 1dfb3671f21b1..a4d26cb5417ea 100644 --- a/cpp-ch/local-engine/jni/ReservationListenerWrapper.h +++ b/cpp-ch/local-engine/jni/ReservationListenerWrapper.h @@ -35,6 +35,8 @@ class ReservationListenerWrapper void reserve(int64_t size); void reserveOrThrow(int64_t size); void free(int64_t size); + /// Make sure destructors in CH Backend do not throw exceptions + void tryFree(int64_t size); size_t currentMemory(); diff --git a/cpp-ch/local-engine/jni/jni_common.cpp b/cpp-ch/local-engine/jni/jni_common.cpp index 4d05b5f48e58a..6eb02a2f450b4 100644 --- a/cpp-ch/local-engine/jni/jni_common.cpp +++ b/cpp-ch/local-engine/jni/jni_common.cpp @@ -73,13 +73,13 @@ jmethodID GetStaticMethodID(JNIEnv * env, jclass this_class, const char * name, jstring charTojstring(JNIEnv * env, const char * pat) { - jclass str_class = (env)->FindClass("Ljava/lang/String;"); - jmethodID ctor_id = (env)->GetMethodID(str_class, "", "([BLjava/lang/String;)V"); - jsize strSize = static_cast(strlen(pat)); - jbyteArray bytes = (env)->NewByteArray(strSize); - 
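// Illustrative sketch, not part of this patch: the cancellation support added
// to FileReaderWrapper above follows a common pattern -- an atomic flag, an
// exchange() so the onCancel() hook runs exactly once, and isCancelled()
// checks at the top of every pull/prepare step. A self-contained analogue
// (names are generic, not the Gluten classes):
#include <atomic>

class CancellableReader
{
public:
    virtual ~CancellableReader() = default;

    void cancel()
    {
        // exchange() returns the previous value, so only the first caller runs onCancel().
        if (!is_cancelled.exchange(true, std::memory_order_acq_rel))
            onCancel();
    }

    bool isCancelled() const { return is_cancelled.load(std::memory_order_acquire); }

    bool pull()
    {
        if (isCancelled())
            return false; // pull()/tryPrepareReader() bail out early once cancelled
        // ... produce a chunk ...
        return true;
    }

protected:
    virtual void onCancel() {} // e.g. forward cancellation to the underlying input format

private:
    std::atomic<bool> is_cancelled{false};
};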
(env)->SetByteArrayRegion(bytes, 0, strSize, reinterpret_cast(const_cast(pat))); - jstring encoding = (env)->NewStringUTF("UTF-8"); - jstring result = static_cast((env)->NewObject(str_class, ctor_id, bytes, encoding)); + const jclass str_class = (env)->FindClass("Ljava/lang/String;"); + const jmethodID ctor_id = (env)->GetMethodID(str_class, "", "([BLjava/lang/String;)V"); + const jsize str_size = static_cast(strlen(pat)); + const jbyteArray bytes = (env)->NewByteArray(str_size); + (env)->SetByteArrayRegion(bytes, 0, str_size, reinterpret_cast(const_cast(pat))); + const jstring encoding = (env)->NewStringUTF("UTF-8"); + const auto result = static_cast((env)->NewObject(str_class, ctor_id, bytes, encoding)); env->DeleteLocalRef(bytes); env->DeleteLocalRef(encoding); return result; diff --git a/cpp-ch/local-engine/jni/jni_common.h b/cpp-ch/local-engine/jni/jni_common.h index c1cc805aa3ed1..c1e0fbead5357 100644 --- a/cpp-ch/local-engine/jni/jni_common.h +++ b/cpp-ch/local-engine/jni/jni_common.h @@ -28,6 +28,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int UNKNOWN_TYPE; } } @@ -62,6 +63,24 @@ jbyteArray stringTojbyteArray(JNIEnv * env, const std::string & str); throw DB::Exception::createRuntime(DB::ErrorCodes::LOGICAL_ERROR, msg); \ } +#define TRY_LOCAL_ENGINE_JNI_JMETHOD_START +#define TRY_LOCAL_ENGINE_JNI_JMETHOD_END(env) \ + if ((env)->ExceptionCheck()) \ + { \ + LOG_ERROR(&Poco::Logger::get("local_engine"), "Enter java exception handle."); \ + auto excp = (env)->ExceptionOccurred(); \ + (env)->ExceptionDescribe(); \ + (env)->ExceptionClear(); \ + jclass cls = (env)->GetObjectClass(excp); \ + jmethodID mid = env->GetMethodID(cls, "toString", "()Ljava/lang/String;"); \ + jstring jmsg = static_cast((env)->CallObjectMethod(excp, mid)); \ + const char * nmsg = (env)->GetStringUTFChars(jmsg, NULL); \ + std::string msg = std::string(nmsg); \ + env->ReleaseStringUTFChars(jmsg, nmsg); \ + LOG_WARNING(&Poco::Logger::get("local_engine"), "Ignore java exception: {}", msg); \ + } + + template jobject safeCallObjectMethod(JNIEnv * env, jobject obj, jmethodID method_id, Args... args) { @@ -106,6 +125,14 @@ void safeCallVoidMethod(JNIEnv * env, jobject obj, jmethodID method_id, Args... LOCAL_ENGINE_JNI_JMETHOD_END(env) } +template +void tryCallVoidMethod(JNIEnv * env, jobject obj, jmethodID method_id, Args... args) +{ + TRY_LOCAL_ENGINE_JNI_JMETHOD_START + env->CallVoidMethod(obj, method_id, args...); + TRY_LOCAL_ENGINE_JNI_JMETHOD_END(env) +} + template jlong safeCallStaticLongMethod(JNIEnv * env, jclass clazz, jmethodID method_id, Args... args) { @@ -114,4 +141,103 @@ jlong safeCallStaticLongMethod(JNIEnv * env, jclass clazz, jmethodID method_id, LOCAL_ENGINE_JNI_JMETHOD_END(env) return ret; } + +// Safe version of JNI {Get|Release}ArrayElements routines. +// SafeNativeArray would release the managed array elements automatically +// during destruction. 
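// Illustrative sketch, not part of this patch: the TRY_LOCAL_ENGINE_JNI_JMETHOD_END
// macro above (used by tryCallVoidMethod and, through it, by
// ReservationListenerWrapper::tryFree) checks for a pending Java exception,
// logs it and clears it instead of rethrowing, which is what makes the call
// safe to use from C++ destructors. The same steps written as a plain helper:
#include <jni.h>
#include <string>

// Returns the pending exception's toString() (empty if none) and clears it.
std::string drainPendingJavaException(JNIEnv * env)
{
    if (!env->ExceptionCheck())
        return {};

    jthrowable exception = env->ExceptionOccurred();
    env->ExceptionDescribe(); // print the stack trace to stderr for debugging
    env->ExceptionClear();    // swallow it so the native side keeps running

    jclass cls = env->GetObjectClass(exception);
    jmethodID to_string = env->GetMethodID(cls, "toString", "()Ljava/lang/String;");
    auto jmsg = static_cast<jstring>(env->CallObjectMethod(exception, to_string));
    const char * chars = env->GetStringUTFChars(jmsg, nullptr);
    std::string message(chars);
    env->ReleaseStringUTFChars(jmsg, chars);
    return message;
}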
+ +enum class JniPrimitiveArrayType { + kBoolean = 0, + kByte = 1, + kChar = 2, + kShort = 3, + kInt = 4, + kLong = 5, + kFloat = 6, + kDouble = 7 +}; + +#define CONCATENATE(t1, t2, t3) t1##t2##t3 + +#define DEFINE_PRIMITIVE_ARRAY(PRIM_TYPE, JAVA_TYPE, JNI_NATIVE_TYPE, NATIVE_TYPE, METHOD_VAR) \ + template <> \ + struct JniPrimitiveArray { \ + using JavaType = JAVA_TYPE; \ + using JniNativeType = JNI_NATIVE_TYPE; \ + using NativeType = NATIVE_TYPE; \ + \ + static JniNativeType get(JNIEnv* env, JavaType javaArray) { \ + return env->CONCATENATE(Get, METHOD_VAR, ArrayElements)(javaArray, nullptr); \ + } \ + \ + static void release(JNIEnv* env, JavaType javaArray, JniNativeType nativeArray) { \ + env->CONCATENATE(Release, METHOD_VAR, ArrayElements)(javaArray, nativeArray, JNI_ABORT); \ + } \ + }; + +template +struct JniPrimitiveArray {}; + +DEFINE_PRIMITIVE_ARRAY(kBoolean, jbooleanArray, jboolean*, bool*, Boolean) +DEFINE_PRIMITIVE_ARRAY(kByte, jbyteArray, jbyte*, uint8_t*, Byte) +DEFINE_PRIMITIVE_ARRAY(kChar, jcharArray, jchar*, uint16_t*, Char) +DEFINE_PRIMITIVE_ARRAY(kShort, jshortArray, jshort*, int16_t*, Short) +DEFINE_PRIMITIVE_ARRAY(kInt, jintArray, jint*, int32_t*, Int) +DEFINE_PRIMITIVE_ARRAY(kLong, jlongArray, jlong*, int64_t*, Long) +DEFINE_PRIMITIVE_ARRAY(kFloat, jfloatArray, jfloat*, float_t*, Float) +DEFINE_PRIMITIVE_ARRAY(kDouble, jdoubleArray, jdouble*, double_t*, Double) + +template +class SafeNativeArray { + using PrimitiveArray = JniPrimitiveArray; + using JavaArrayType = typename PrimitiveArray::JavaType; + using JniNativeArrayType = typename PrimitiveArray::JniNativeType; + using NativeArrayType = typename PrimitiveArray::NativeType; + + public: + virtual ~SafeNativeArray() { + PrimitiveArray::release(env_, javaArray_, nativeArray_); + } + + SafeNativeArray(const SafeNativeArray&) = delete; + SafeNativeArray(SafeNativeArray&&) = delete; + SafeNativeArray& operator=(const SafeNativeArray&) = delete; + SafeNativeArray& operator=(SafeNativeArray&&) = delete; + + const NativeArrayType elems() const { + return reinterpret_cast(nativeArray_); + } + + const jsize length() const { + return env_->GetArrayLength(javaArray_); + } + + static SafeNativeArray get(JNIEnv* env, JavaArrayType javaArray) { + JniNativeArrayType nativeArray = PrimitiveArray::get(env, javaArray); + return SafeNativeArray(env, javaArray, nativeArray); + } + + private: + SafeNativeArray(JNIEnv* env, JavaArrayType javaArray, JniNativeArrayType nativeArray) + : env_(env), javaArray_(javaArray), nativeArray_(nativeArray){}; + + JNIEnv* env_; + JavaArrayType javaArray_; + JniNativeArrayType nativeArray_; +}; + +#define DEFINE_SAFE_GET_PRIMITIVE_ARRAY_FUNCTIONS(PRIM_TYPE, JAVA_TYPE, METHOD_VAR) \ + inline SafeNativeArray CONCATENATE(get, METHOD_VAR, ArrayElementsSafe)( \ + JNIEnv * env, JAVA_TYPE array) { \ + return SafeNativeArray::get(env, array); \ + } + +DEFINE_SAFE_GET_PRIMITIVE_ARRAY_FUNCTIONS(kBoolean, jbooleanArray, Boolean) +DEFINE_SAFE_GET_PRIMITIVE_ARRAY_FUNCTIONS(kByte, jbyteArray, Byte) +DEFINE_SAFE_GET_PRIMITIVE_ARRAY_FUNCTIONS(kChar, jcharArray, Char) +DEFINE_SAFE_GET_PRIMITIVE_ARRAY_FUNCTIONS(kShort, jshortArray, Short) +DEFINE_SAFE_GET_PRIMITIVE_ARRAY_FUNCTIONS(kInt, jintArray, Int) +DEFINE_SAFE_GET_PRIMITIVE_ARRAY_FUNCTIONS(kLong, jlongArray, Long) +DEFINE_SAFE_GET_PRIMITIVE_ARRAY_FUNCTIONS(kFloat, jfloatArray, Float) +DEFINE_SAFE_GET_PRIMITIVE_ARRAY_FUNCTIONS(kDouble, jdoubleArray, Double) } diff --git a/cpp-ch/local-engine/jni/jni_error.h b/cpp-ch/local-engine/jni/jni_error.h index 
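// Illustrative usage sketch, not part of this patch: with the SafeNativeArray
// wrapper and the get*ArrayElementsSafe helpers defined above in jni_common.h,
// the manual GetByteArrayElements/ReleaseByteArrayElements pairs in the JNI
// entry points collapse into RAII, which is how the rewritten functions further
// down use them. Assumes jni_common.h (as patched above) is on the include path.
#include <jni.h>
#include <string>

std::string jbyteArrayToString(JNIEnv * env, jbyteArray array)
{
    const auto safe = local_engine::getByteArrayElementsSafe(env, array);
    // elems()/length() expose the pinned buffer; the destructor releases it with JNI_ABORT.
    return {reinterpret_cast<const char *>(safe.elems()), static_cast<size_t>(safe.length())};
}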
216a5da9392f3..c6f46bc8fc6e2 100644 --- a/cpp-ch/local-engine/jni/jni_error.h +++ b/cpp-ch/local-engine/jni/jni_error.h @@ -37,6 +37,8 @@ class JniErrorsGlobalState : boost::noncopyable ~JniErrorsGlobalState() = default; static JniErrorsGlobalState & instance(); + static void throwException(JNIEnv * env, jclass exception_class, const std::string & message, const std::string & stack_trace = ""); + void initialize(JNIEnv * env_); void destroy(JNIEnv * env); @@ -48,7 +50,6 @@ class JniErrorsGlobalState : boost::noncopyable void throwException(JNIEnv * env, const DB::Exception & e); void throwException(JNIEnv * env, const std::exception & e); - static void throwException(JNIEnv * env, jclass exception_class, const std::string & message, const std::string & stack_trace = ""); void throwRuntimeException(JNIEnv * env, const std::string & message, const std::string & stack_trace = ""); diff --git a/cpp-ch/local-engine/local_engine_jni.cpp b/cpp-ch/local-engine/local_engine_jni.cpp index c7721b470cf8f..695fc85855384 100644 --- a/cpp-ch/local-engine/local_engine_jni.cpp +++ b/cpp-ch/local-engine/local_engine_jni.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -37,10 +36,14 @@ #include #include #include +#include +#include #include #include #include +#include #include +#include #include #include #include @@ -52,10 +55,6 @@ #include #include #include -#include -#include -#include -#include #ifdef __cplusplus @@ -94,16 +93,6 @@ static std::string jstring2string(JNIEnv * env, jstring jStr) return ret; } -static jstring stringTojstring(JNIEnv * env, const char * pat) -{ - jclass strClass = (env)->FindClass("java/lang/String"); - jmethodID ctorID = (env)->GetMethodID(strClass, "", "([BLjava/lang/String;)V"); - jbyteArray bytes = (env)->NewByteArray(strlen(pat)); - (env)->SetByteArrayRegion(bytes, 0, strlen(pat), reinterpret_cast(pat)); - jstring encoding = (env)->NewStringUTF("UTF-8"); - return static_cast((env)->NewObject(strClass, ctorID, bytes, encoding)); -} - extern "C" { #endif @@ -122,9 +111,6 @@ static jmethodID block_stripes_constructor; static jclass split_result_class; static jmethodID split_result_constructor; -static jclass native_metrics_class; -static jmethodID native_metrics_constructor; - JNIEXPORT jint JNI_OnLoad(JavaVM * vm, void * /*reserved*/) { JNIEnv * env; @@ -189,10 +175,6 @@ JNIEXPORT jint JNI_OnLoad(JavaVM * vm, void * /*reserved*/) local_engine::ReservationListenerWrapper::reservation_listener_currentMemory = local_engine::GetMethodID(env, local_engine::ReservationListenerWrapper::reservation_listener_class, "currentMemory", "()J"); - - native_metrics_class = local_engine::CreateGlobalClassReference(env, "Lorg/apache/gluten/metrics/NativeMetrics;"); - native_metrics_constructor = local_engine::GetMethodID(env, native_metrics_class, "", "(Ljava/lang/String;)V"); - local_engine::BroadCastJoinBuilder::init(env); local_engine::JNIUtils::vm = vm; @@ -219,18 +201,14 @@ JNIEXPORT void JNI_OnUnload(JavaVM * vm, void * /*reserved*/) env->DeleteGlobalRef(local_engine::SourceFromJavaIter::serialized_record_batch_iterator_class); env->DeleteGlobalRef(local_engine::SparkRowToCHColumn::spark_row_interator_class); env->DeleteGlobalRef(local_engine::ReservationListenerWrapper::reservation_listener_class); - env->DeleteGlobalRef(native_metrics_class); } JNIEXPORT void Java_org_apache_gluten_vectorized_ExpressionEvaluatorJniWrapper_nativeInitNative(JNIEnv * env, jobject, jbyteArray conf_plan) { LOCAL_ENGINE_JNI_METHOD_START - jsize plan_buf_size = 
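// Illustrative sketch, not part of this patch: the JNI_OnLoad / JNI_OnUnload
// edits above drop the cached NativeMetrics class reference (metrics are
// returned as a plain jstring further down). The general pattern they follow --
// cache a jclass as a global reference at load time, release it at unload --
// looks like this in isolation (the class name is only an example):
#include <jni.h>

static jclass example_class = nullptr;
static jmethodID example_ctor = nullptr;

JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM * vm, void * /*reserved*/)
{
    JNIEnv * env = nullptr;
    if (vm->GetEnv(reinterpret_cast<void **>(&env), JNI_VERSION_1_8) != JNI_OK)
        return JNI_ERR;

    jclass local = env->FindClass("java/lang/StringBuilder");
    example_class = static_cast<jclass>(env->NewGlobalRef(local)); // survives across native calls
    env->DeleteLocalRef(local);
    example_ctor = env->GetMethodID(example_class, "<init>", "()V"); // jmethodIDs need no global ref

    return JNI_VERSION_1_8;
}

JNIEXPORT void JNICALL JNI_OnUnload(JavaVM * vm, void * /*reserved*/)
{
    JNIEnv * env = nullptr;
    if (vm->GetEnv(reinterpret_cast<void **>(&env), JNI_VERSION_1_8) == JNI_OK && example_class)
        env->DeleteGlobalRef(example_class);
}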
env->GetArrayLength(conf_plan); - jbyte * plan_buf_addr = env->GetByteArrayElements(conf_plan, nullptr); - std::string plan_str; - plan_str.assign(reinterpret_cast(plan_buf_addr), plan_buf_size); - local_engine::BackendInitializerUtil::init(&plan_str); - env->ReleaseByteArrayElements(conf_plan, plan_buf_addr, JNI_ABORT); + const auto conf_plan_a = local_engine::getByteArrayElementsSafe(env, conf_plan); + const std::string::size_type plan_buf_size = conf_plan_a.length(); + local_engine::BackendInitializerUtil::init({reinterpret_cast(conf_plan_a.elems()), plan_buf_size}); LOCAL_ENGINE_JNI_METHOD_END(env, ) } @@ -255,11 +233,10 @@ JNIEXPORT jlong Java_org_apache_gluten_vectorized_ExpressionEvaluatorJniWrapper_ auto query_context = local_engine::getAllocator(allocator_id)->query_context; // by task update new configs ( in case of dynamic config update ) - jsize plan_buf_size = env->GetArrayLength(conf_plan); - jbyte * plan_buf_addr = env->GetByteArrayElements(conf_plan, nullptr); - std::string plan_str; - plan_str.assign(reinterpret_cast(plan_buf_addr), plan_buf_size); - local_engine::BackendInitializerUtil::updateConfig(query_context, &plan_str); + const auto conf_plan_a = local_engine::getByteArrayElementsSafe(env, conf_plan); + const std::string::size_type conf_plan_size = conf_plan_a.length(); + local_engine::BackendInitializerUtil::updateConfig( + query_context, {reinterpret_cast(conf_plan_a.elems()), conf_plan_size}); local_engine::SerializedPlanParser parser(query_context); jsize iter_num = env->GetArrayLength(iter_arr); @@ -270,68 +247,26 @@ JNIEXPORT jlong Java_org_apache_gluten_vectorized_ExpressionEvaluatorJniWrapper_ parser.addInputIter(iter, materialize_input); } - for (jsize i = 0, split_info_arr_size = env->GetArrayLength(split_infos); i < split_info_arr_size; i++) { + for (jsize i = 0, split_info_arr_size = env->GetArrayLength(split_infos); i < split_info_arr_size; i++) + { jbyteArray split_info = static_cast(env->GetObjectArrayElement(split_infos, i)); - jsize split_info_size = env->GetArrayLength(split_info); - jbyte * split_info_addr = env->GetByteArrayElements(split_info, nullptr); - std::string split_info_str; - split_info_str.assign(reinterpret_cast(split_info_addr), split_info_size); - parser.addSplitInfo(split_info_str); + const auto split_info_a = local_engine::getByteArrayElementsSafe(env, split_info); + const std::string::size_type split_info_size = split_info_a.length(); + parser.addSplitInfo({reinterpret_cast(split_info_a.elems()), split_info_size}); } - jsize plan_size = env->GetArrayLength(plan); - jbyte * plan_address = env->GetByteArrayElements(plan, nullptr); - std::string plan_string; - plan_string.assign(reinterpret_cast(plan_address), plan_size); - auto query_plan = parser.parse(plan_string); - local_engine::LocalExecutor * executor = new local_engine::LocalExecutor(query_context); + const auto plan_a = local_engine::getByteArrayElementsSafe(env, plan); + const std::string::size_type plan_size = plan_a.length(); + local_engine::LocalExecutor * executor + = parser.createExecutor({reinterpret_cast(plan_a.elems()), plan_size}).release(); + LOG_INFO(&Poco::Logger::get("jni"), "Construct LocalExecutor {}", reinterpret_cast(executor)); executor->setMetric(parser.getMetric()); executor->setExtraPlanHolder(parser.extra_plan_holder); - executor->execute(std::move(query_plan)); - env->ReleaseByteArrayElements(plan, plan_address, JNI_ABORT); - env->ReleaseByteArrayElements(conf_plan, plan_buf_addr, JNI_ABORT); + return reinterpret_cast(executor); 
LOCAL_ENGINE_JNI_METHOD_END(env, -1) } -JNIEXPORT jboolean Java_org_apache_gluten_row_RowIterator_nativeHasNext(JNIEnv * env, jobject /*obj*/, jlong executor_address) -{ - LOCAL_ENGINE_JNI_METHOD_START - local_engine::LocalExecutor * executor = reinterpret_cast(executor_address); - return executor->hasNext(); - LOCAL_ENGINE_JNI_METHOD_END(env, false) -} - -JNIEXPORT jobject Java_org_apache_gluten_row_RowIterator_nativeNext(JNIEnv * env, jobject /*obj*/, jlong executor_address) -{ - LOCAL_ENGINE_JNI_METHOD_START - local_engine::LocalExecutor * executor = reinterpret_cast(executor_address); - local_engine::SparkRowInfoPtr spark_row_info = executor->next(); - - auto * offsets_arr = env->NewLongArray(spark_row_info->getNumRows()); - const auto * offsets_src = reinterpret_cast(spark_row_info->getOffsets().data()); - env->SetLongArrayRegion(offsets_arr, 0, spark_row_info->getNumRows(), offsets_src); - auto * lengths_arr = env->NewLongArray(spark_row_info->getNumRows()); - const auto * lengths_src = reinterpret_cast(spark_row_info->getLengths().data()); - env->SetLongArrayRegion(lengths_arr, 0, spark_row_info->getNumRows(), lengths_src); - int64_t address = reinterpret_cast(spark_row_info->getBufferAddress()); - int64_t column_number = reinterpret_cast(spark_row_info->getNumCols()); - int64_t total_size = reinterpret_cast(spark_row_info->getTotalBytes()); - - jobject spark_row_info_object - = env->NewObject(spark_row_info_class, spark_row_info_constructor, offsets_arr, lengths_arr, address, column_number, total_size); - return spark_row_info_object; - LOCAL_ENGINE_JNI_METHOD_END(env, nullptr) -} - -JNIEXPORT void Java_org_apache_gluten_row_RowIterator_nativeClose(JNIEnv * env, jobject /*obj*/, jlong executor_address) -{ - LOCAL_ENGINE_JNI_METHOD_START - local_engine::LocalExecutor * executor = reinterpret_cast(executor_address); - delete executor; - LOCAL_ENGINE_JNI_METHOD_END(env, ) -} - // Columnar Iterator JNIEXPORT jboolean Java_org_apache_gluten_vectorized_BatchIterator_nativeHasNext(JNIEnv * env, jobject /*obj*/, jlong executor_address) { @@ -346,45 +281,38 @@ JNIEXPORT jlong Java_org_apache_gluten_vectorized_BatchIterator_nativeCHNext(JNI LOCAL_ENGINE_JNI_METHOD_START local_engine::LocalExecutor * executor = reinterpret_cast(executor_address); DB::Block * column_batch = executor->nextColumnar(); - // LOG_DEBUG(&Poco::Logger::get("jni"), "row size of the column batch: {}", column_batch->rows()); return reinterpret_cast(column_batch); LOCAL_ENGINE_JNI_METHOD_END(env, -1) } -JNIEXPORT void Java_org_apache_gluten_vectorized_BatchIterator_nativeClose(JNIEnv * env, jobject /*obj*/, jlong executor_address) +JNIEXPORT void Java_org_apache_gluten_vectorized_BatchIterator_nativeCancel(JNIEnv * env, jobject /*obj*/, jlong executor_address) { LOCAL_ENGINE_JNI_METHOD_START - local_engine::LocalExecutor * executor = reinterpret_cast(executor_address); - delete executor; + auto *executor = reinterpret_cast(executor_address); + executor->cancel(); + LOG_INFO(&Poco::Logger::get("jni"), "Cancel LocalExecutor {}", reinterpret_cast(executor)); LOCAL_ENGINE_JNI_METHOD_END(env, ) } -JNIEXPORT jobject Java_org_apache_gluten_vectorized_BatchIterator_nativeFetchMetrics(JNIEnv * env, jobject /*obj*/, jlong executor_address) +JNIEXPORT void Java_org_apache_gluten_vectorized_BatchIterator_nativeClose(JNIEnv * env, jobject /*obj*/, jlong executor_address) { LOCAL_ENGINE_JNI_METHOD_START - /// Collect metrics only if optimizations are disabled, otherwise coredump would happen. 
- local_engine::LocalExecutor * executor = reinterpret_cast(executor_address); - auto metric = executor->getMetric(); - String metrics_json = metric ? local_engine::RelMetricSerializer::serializeRelMetric(metric) : ""; - LOG_DEBUG(&Poco::Logger::get("jni"), "{}", metrics_json); - jobject native_metrics = env->NewObject(native_metrics_class, native_metrics_constructor, stringTojstring(env, metrics_json.c_str())); - return native_metrics; - LOCAL_ENGINE_JNI_METHOD_END(env, nullptr) -} - -JNIEXPORT void -Java_org_apache_gluten_vectorized_ExpressionEvaluatorJniWrapper_nativeSetJavaTmpDir(JNIEnv * /*env*/, jobject /*obj*/, jstring /*dir*/) -{ + auto *executor = reinterpret_cast(executor_address); + LOG_INFO(&Poco::Logger::get("jni"), "Finalize LocalExecutor {}", reinterpret_cast(executor)); + delete executor; + LOCAL_ENGINE_JNI_METHOD_END(env, ) } -JNIEXPORT void -Java_org_apache_gluten_vectorized_ExpressionEvaluatorJniWrapper_nativeSetBatchSize(JNIEnv * /*env*/, jobject /*obj*/, jint /*batch_size*/) +JNIEXPORT jstring Java_org_apache_gluten_vectorized_BatchIterator_nativeFetchMetrics(JNIEnv * env, jobject /*obj*/, jlong executor_address) { -} + LOCAL_ENGINE_JNI_METHOD_START + /// Collect metrics only if optimizations are disabled, otherwise coredump would happen. + const local_engine::LocalExecutor * executor = reinterpret_cast(executor_address); + const auto metric = executor->getMetric(); + const String metrics_json = metric ? local_engine::RelMetricSerializer::serializeRelMetric(metric) : ""; -JNIEXPORT void Java_org_apache_gluten_vectorized_ExpressionEvaluatorJniWrapper_nativeSetMetricsTime( - JNIEnv * /*env*/, jobject /*obj*/, jboolean /*setMetricsTime*/) -{ + return local_engine::charTojstring(env, metrics_json.c_str()); + LOCAL_ENGINE_JNI_METHOD_END(env, nullptr) } JNIEXPORT jboolean @@ -603,52 +531,6 @@ JNIEXPORT void Java_org_apache_gluten_vectorized_CHStreamReader_nativeClose(JNIE LOCAL_ENGINE_JNI_METHOD_END(env, ) } -JNIEXPORT jlong Java_org_apache_gluten_vectorized_CHCoalesceOperator_createNativeOperator(JNIEnv * env, jobject /*obj*/, jint buf_size) -{ - LOCAL_ENGINE_JNI_METHOD_START - local_engine::BlockCoalesceOperator * instance = new local_engine::BlockCoalesceOperator(buf_size); - return reinterpret_cast(instance); - LOCAL_ENGINE_JNI_METHOD_END(env, -1) -} - -JNIEXPORT void Java_org_apache_gluten_vectorized_CHCoalesceOperator_nativeMergeBlock( - JNIEnv * env, jobject /*obj*/, jlong instance_address, jlong block_address) -{ - LOCAL_ENGINE_JNI_METHOD_START - local_engine::BlockCoalesceOperator * instance = reinterpret_cast(instance_address); - DB::Block * block = reinterpret_cast(block_address); - auto new_block = DB::Block(*block); - instance->mergeBlock(new_block); - LOCAL_ENGINE_JNI_METHOD_END(env, ) -} - -JNIEXPORT jboolean Java_org_apache_gluten_vectorized_CHCoalesceOperator_nativeIsFull(JNIEnv * env, jobject /*obj*/, jlong instance_address) -{ - LOCAL_ENGINE_JNI_METHOD_START - local_engine::BlockCoalesceOperator * instance = reinterpret_cast(instance_address); - bool full = instance->isFull(); - return full ? 
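// Illustrative sketch, not part of this patch: the BatchIterator entry points
// above manage the LocalExecutor through an opaque jlong handle. Create
// releases a unique_ptr into a raw pointer handed to Java, cancel/next cast it
// back, and close is the single place that deletes it. Stripped of the Gluten
// types, the pattern is:
#include <jni.h>
#include <memory>

struct Executor // stand-in for local_engine::LocalExecutor
{
    void cancel() {}
    bool hasNext() { return false; }
};

jlong createHandle()
{
    auto executor = std::make_unique<Executor>();
    return reinterpret_cast<jlong>(executor.release()); // ownership moves to the Java side
}

void cancelHandle(jlong handle)
{
    reinterpret_cast<Executor *>(handle)->cancel(); // may be called any time before close
}

void closeHandle(jlong handle)
{
    delete reinterpret_cast<Executor *>(handle); // nativeClose is the only place that frees it
}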
JNI_TRUE : JNI_FALSE; - LOCAL_ENGINE_JNI_METHOD_END(env, false) -} - -JNIEXPORT jlong Java_org_apache_gluten_vectorized_CHCoalesceOperator_nativeRelease(JNIEnv * env, jobject /*obj*/, jlong instance_address) -{ - LOCAL_ENGINE_JNI_METHOD_START - local_engine::BlockCoalesceOperator * instance = reinterpret_cast(instance_address); - auto * block = instance->releaseBlock(); - Int64 address = reinterpret_cast(block); - return address; - LOCAL_ENGINE_JNI_METHOD_END(env, -1) -} - -JNIEXPORT void Java_org_apache_gluten_vectorized_CHCoalesceOperator_nativeClose(JNIEnv * env, jobject /*obj*/, jlong instance_address) -{ - LOCAL_ENGINE_JNI_METHOD_START - local_engine::BlockCoalesceOperator * instance = reinterpret_cast(instance_address); - delete instance; - LOCAL_ENGINE_JNI_METHOD_END(env, ) -} - // Splitter Jni Wrapper JNIEXPORT jlong Java_org_apache_gluten_vectorized_CHShuffleSplitterJniWrapper_nativeMake( JNIEnv * env, @@ -671,29 +553,24 @@ JNIEXPORT jlong Java_org_apache_gluten_vectorized_CHShuffleSplitterJniWrapper_na jboolean flush_block_buffer_before_evict, jlong max_sort_buffer_size, jboolean spill_firstly_before_stop, - jboolean force_sort) + jboolean force_external_sort, + jboolean force_memory_sort) { LOCAL_ENGINE_JNI_METHOD_START std::string hash_exprs; std::string out_exprs; if (expr_list != nullptr) { - int len = env->GetArrayLength(expr_list); - auto * str = reinterpret_cast(new char[len]); - memset(str, 0, len); - env->GetByteArrayRegion(expr_list, 0, len, str); - hash_exprs = std::string(str, str + len); - delete[] str; + const auto expr_list_a = local_engine::getByteArrayElementsSafe(env, expr_list); + const std::string::size_type expr_list_size = expr_list_a.length(); + hash_exprs = std::string{reinterpret_cast(expr_list_a.elems()), expr_list_size}; } if (out_expr_list != nullptr) { - int len = env->GetArrayLength(out_expr_list); - auto * str = reinterpret_cast(new char[len]); - memset(str, 0, len); - env->GetByteArrayRegion(out_expr_list, 0, len, str); - out_exprs = std::string(str, str + len); - delete[] str; + const auto out_expr_list_a = local_engine::getByteArrayElementsSafe(env, out_expr_list); + const std::string::size_type out_expr_list_size = out_expr_list_a.length(); + out_exprs = std::string{reinterpret_cast(out_expr_list_a.elems()), out_expr_list_size}; } Poco::StringTokenizer local_dirs_tokenizer(jstring2string(env, local_dirs), ","); @@ -718,8 +595,8 @@ JNIEXPORT jlong Java_org_apache_gluten_vectorized_CHShuffleSplitterJniWrapper_na .flush_block_buffer_before_evict = static_cast(flush_block_buffer_before_evict), .max_sort_buffer_size = static_cast(max_sort_buffer_size), .spill_firstly_before_stop = static_cast(spill_firstly_before_stop), - .force_sort = static_cast(force_sort) - }; + .force_external_sort = static_cast(force_external_sort), + .force_mermory_sort = static_cast(force_memory_sort)}; auto name = jstring2string(env, short_name); local_engine::SplitterHolder * splitter; if (prefer_spill) @@ -745,27 +622,25 @@ JNIEXPORT jlong Java_org_apache_gluten_vectorized_CHShuffleSplitterJniWrapper_na jstring hash_algorithm, jobject pusher, jboolean throw_if_memory_exceed, - jboolean flush_block_buffer_before_evict) + jboolean flush_block_buffer_before_evict, + jboolean force_external_sort, + jboolean force_memory_sort) { LOCAL_ENGINE_JNI_METHOD_START std::string hash_exprs; std::string out_exprs; if (expr_list != nullptr) { - int len = env->GetArrayLength(expr_list); - auto * str = reinterpret_cast(new char[len]); - env->GetByteArrayRegion(expr_list, 0, len, str); - 
hash_exprs = std::string(str, str + len); - delete[] str; + const auto expr_list_a = local_engine::getByteArrayElementsSafe(env, expr_list); + const std::string::size_type expr_list_size = expr_list_a.length(); + hash_exprs = std::string{reinterpret_cast(expr_list_a.elems()), expr_list_size}; } if (out_expr_list != nullptr) { - int len = env->GetArrayLength(out_expr_list); - auto * str = reinterpret_cast(new char[len]); - env->GetByteArrayRegion(out_expr_list, 0, len, str); - out_exprs = std::string(str, str + len); - delete[] str; + const auto out_expr_list_a = local_engine::getByteArrayElementsSafe(env, out_expr_list); + const std::string::size_type out_expr_list_size = out_expr_list_a.length(); + out_exprs = std::string{reinterpret_cast(out_expr_list_a.elems()), out_expr_list_size}; } local_engine::SplitOptions options{ @@ -780,7 +655,9 @@ JNIEXPORT jlong Java_org_apache_gluten_vectorized_CHShuffleSplitterJniWrapper_na .spill_threshold = static_cast(spill_threshold), .hash_algorithm = jstring2string(env, hash_algorithm), .throw_if_memory_exceed = static_cast(throw_if_memory_exceed), - .flush_block_buffer_before_evict = static_cast(flush_block_buffer_before_evict)}; + .flush_block_buffer_before_evict = static_cast(flush_block_buffer_before_evict), + .force_external_sort = static_cast(force_external_sort), + .force_mermory_sort = static_cast(force_memory_sort)}; auto name = jstring2string(env, short_name); local_engine::SplitterHolder * splitter; splitter = new local_engine::SplitterHolder{.splitter = std::make_unique(name, options, pusher)}; @@ -851,8 +728,8 @@ JNIEXPORT void Java_org_apache_gluten_vectorized_CHShuffleSplitterJniWrapper_clo } // CHBlockConverterJniWrapper -JNIEXPORT jobject -Java_org_apache_gluten_vectorized_CHBlockConverterJniWrapper_convertColumnarToRow(JNIEnv * env, jclass, jlong block_address, jintArray masks) +JNIEXPORT jobject Java_org_apache_gluten_vectorized_CHBlockConverterJniWrapper_convertColumnarToRow( + JNIEnv * env, jclass, jlong block_address, jintArray masks) { LOCAL_ENGINE_JNI_METHOD_START local_engine::CHColumnToSparkRow converter; @@ -862,14 +739,12 @@ Java_org_apache_gluten_vectorized_CHBlockConverterJniWrapper_convertColumnarToRo DB::Block * block = reinterpret_cast(block_address); if (masks != nullptr) { - jint size = env->GetArrayLength(masks); - jboolean is_cp = JNI_FALSE; - jint * values = env->GetIntArrayElements(masks, &is_cp); + auto safeArray = local_engine::getIntArrayElementsSafe(env, masks); mask = std::make_unique>(); - for (int j = 0; j < size; j++) - mask->push_back(values[j]); - env->ReleaseIntArrayElements(masks, values, JNI_ABORT); + for (int j = 0; j < safeArray.length(); j++) + mask->push_back(safeArray.elems()[j]); } + spark_row_info = converter.convertCHColumnToSparkRow(*block, mask); auto * offsets_arr = env->NewLongArray(spark_row_info->getNumRows()); @@ -1015,52 +890,39 @@ JNIEXPORT jlong Java_org_apache_spark_sql_execution_datasources_CHDatasourceJniW LOCAL_ENGINE_JNI_METHOD_START auto query_context = local_engine::getAllocator(allocator_id)->query_context; // by task update new configs ( in case of dynamic config update ) - jsize conf_plan_buf_size = env->GetArrayLength(conf_plan); - jbyte * conf_plan_buf_addr = env->GetByteArrayElements(conf_plan, nullptr); - std::string conf_plan_str; - conf_plan_str.assign(reinterpret_cast(conf_plan_buf_addr), conf_plan_buf_size); - local_engine::BackendInitializerUtil::updateConfig(query_context, &conf_plan_str); + const auto conf_plan_a = local_engine::getByteArrayElementsSafe(env, 
conf_plan); + const std::string::size_type conf_plan_size = conf_plan_a.length(); + local_engine::BackendInitializerUtil::updateConfig( + query_context, {reinterpret_cast(conf_plan_a.elems()), conf_plan_size}); const auto uuid_str = jstring2string(env, uuid_); const auto task_id = jstring2string(env, task_id_); const auto partition_dir = jstring2string(env, partition_dir_); const auto bucket_dir = jstring2string(env, bucket_dir_); - jsize plan_buf_size = env->GetArrayLength(plan_); - jbyte * plan_buf_addr = env->GetByteArrayElements(plan_, nullptr); - std::string plan_str; - plan_str.assign(reinterpret_cast(plan_buf_addr), plan_buf_size); + const auto plan_a = local_engine::getByteArrayElementsSafe(env, plan_); - jsize split_info_size = env->GetArrayLength(split_info_); - jbyte * split_info_addr = env->GetByteArrayElements(split_info_, nullptr); - std::string split_info_str; - split_info_str.assign(reinterpret_cast(split_info_addr), split_info_size); - - auto plan_ptr = std::make_unique(); /// https://stackoverflow.com/questions/52028583/getting-error-parsing-protobuf-data /// Parsing may fail when the number of recursive layers is large. /// Here, set a limit large enough to avoid this problem. /// Once this problem occurs, it is difficult to troubleshoot, because the pb of c++ will not provide any valid information - google::protobuf::io::CodedInputStream coded_in( - reinterpret_cast(plan_str.data()), static_cast(plan_str.size())); + google::protobuf::io::CodedInputStream coded_in(plan_a.elems(), plan_a.length()); coded_in.SetRecursionLimit(100000); - auto ok = plan_ptr->ParseFromCodedStream(&coded_in); - if (!ok) + substrait::Plan plan_ptr; + if (!plan_ptr.ParseFromCodedStream(&coded_in)) throw DB::Exception(DB::ErrorCodes::CANNOT_PARSE_PROTOBUF_SCHEMA, "Parse substrait::Plan from string failed"); - substrait::ReadRel::ExtensionTable extension_table = - local_engine::SerializedPlanParser::parseExtensionTable(split_info_str); + const auto split_info_a = local_engine::getByteArrayElementsSafe(env, split_info_); + const std::string::size_type split_info_size = split_info_a.length(); + std::string split_info_str{reinterpret_cast(split_info_a.elems()), split_info_size}; + + substrait::ReadRel::ExtensionTable extension_table = local_engine::SerializedPlanParser::parseExtensionTable(split_info_str); - auto storage = local_engine::MergeTreeRelParser::parseStorage( - extension_table, local_engine::SerializedPlanParser::global_context); + auto merge_tree_table = local_engine::MergeTreeRelParser::parseMergeTreeTable(extension_table); auto uuid = uuid_str + "_" + task_id; - auto * writer = new local_engine::SparkMergeTreeWriter( - storage, storage->getInMemoryMetadataPtr(), query_context, uuid, partition_dir, bucket_dir); + auto * writer = new local_engine::SparkMergeTreeWriter(merge_tree_table, query_context, uuid, partition_dir, bucket_dir); - env->ReleaseByteArrayElements(plan_, plan_buf_addr, JNI_ABORT); - env->ReleaseByteArrayElements(split_info_, split_info_addr, JNI_ABORT); - env->ReleaseByteArrayElements(conf_plan, conf_plan_buf_addr, JNI_ABORT); return reinterpret_cast(writer); LOCAL_ENGINE_JNI_METHOD_END(env, 0) } @@ -1070,41 +932,32 @@ JNIEXPORT jstring Java_org_apache_spark_sql_execution_datasources_CHDatasourceJn JNIEnv * env, jclass, jbyteArray plan_, jbyteArray read_) { LOCAL_ENGINE_JNI_METHOD_START - jsize plan_buf_size = env->GetArrayLength(plan_); - jbyte * plan_buf_addr = env->GetByteArrayElements(plan_, nullptr); - std::string plan_str; - 
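// Illustrative sketch, not part of this patch: several writer entry points
// above parse a substrait::Plan through a CodedInputStream with a raised
// recursion limit, because deeply nested plans exceed protobuf's default of
// 100 levels. The parse step in isolation (the generated header path is an
// assumption based on the proto layout in this repo):
#include <google/protobuf/io/coded_stream.h>
#include <cstdint>
#include <stdexcept>
#include <substrait/plan.pb.h> // assumed: generated Substrait protobuf header

substrait::Plan parsePlan(const uint8_t * data, int size)
{
    google::protobuf::io::CodedInputStream coded_in(data, size);
    coded_in.SetRecursionLimit(100000); // the default limit is too small for deep plans

    substrait::Plan plan;
    if (!plan.ParseFromCodedStream(&coded_in))
        throw std::runtime_error("Parse substrait::Plan from string failed");
    return plan;
}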
plan_str.assign(reinterpret_cast(plan_buf_addr), plan_buf_size); + const auto plan_a = local_engine::getByteArrayElementsSafe(env, plan_); + const std::string::size_type plan_size = plan_a.length(); - auto plan_ptr = std::make_unique(); - if (!plan_ptr->ParseFromString(plan_str)) + substrait::Plan plan_ptr; + if (!plan_ptr.ParseFromString({reinterpret_cast(plan_a.elems()), plan_size})) throw Exception(DB::ErrorCodes::CANNOT_PARSE_PROTOBUF_SCHEMA, "Parse substrait::Plan from string failed"); - jsize read_buf_size = env->GetArrayLength(read_); - jbyte * read_buf_addr = env->GetByteArrayElements(read_, nullptr); - std::string filter_str; - filter_str.assign(reinterpret_cast(read_buf_addr), read_buf_size); - - auto read_ptr = std::make_unique(); + const auto read_a = local_engine::getByteArrayElementsSafe(env, read_); /// https://stackoverflow.com/questions/52028583/getting-error-parsing-protobuf-data /// Parsing may fail when the number of recursive layers is large. /// Here, set a limit large enough to avoid this problem. /// Once this problem occurs, it is difficult to troubleshoot, because the pb of c++ will not provide any valid information - google::protobuf::io::CodedInputStream coded_in( - reinterpret_cast(filter_str.data()), static_cast(filter_str.size())); + google::protobuf::io::CodedInputStream coded_in(read_a.elems(), read_a.length()); coded_in.SetRecursionLimit(100000); - if (!read_ptr->ParseFromCodedStream(&coded_in)) + substrait::Rel read_ptr; + if (!read_ptr.ParseFromCodedStream(&coded_in)) throw Exception(DB::ErrorCodes::CANNOT_PARSE_PROTOBUF_SCHEMA, "Parse substrait::Expression from string failed"); local_engine::SerializedPlanParser parser(local_engine::SerializedPlanParser::global_context); - parser.parseExtensions(plan_ptr->extensions()); + parser.parseExtensions(plan_ptr.extensions()); local_engine::MergeTreeRelParser mergeTreeParser(&parser, local_engine::SerializedPlanParser::global_context); - auto res = mergeTreeParser.filterRangesOnDriver(read_ptr->read()); + auto res = mergeTreeParser.filterRangesOnDriver(read_ptr.read()); - env->ReleaseByteArrayElements(plan_, plan_buf_addr, JNI_ABORT); - env->ReleaseByteArrayElements(read_, read_buf_addr, JNI_ABORT); - return stringTojstring(env, res.c_str()); + return local_engine::charTojstring(env, res.c_str()); LOCAL_ENGINE_JNI_METHOD_END(env, nullptr) } @@ -1128,8 +981,8 @@ JNIEXPORT void Java_org_apache_spark_sql_execution_datasources_CHDatasourceJniWr LOCAL_ENGINE_JNI_METHOD_END(env, ) } -JNIEXPORT void -Java_org_apache_spark_sql_execution_datasources_CHDatasourceJniWrapper_writeToMergeTree(JNIEnv * env, jobject, jlong instanceId, jlong block_address) +JNIEXPORT void Java_org_apache_spark_sql_execution_datasources_CHDatasourceJniWrapper_writeToMergeTree( + JNIEnv * env, jobject, jlong instanceId, jlong block_address) { LOCAL_ENGINE_JNI_METHOD_START auto * writer = reinterpret_cast(instanceId); @@ -1138,7 +991,8 @@ Java_org_apache_spark_sql_execution_datasources_CHDatasourceJniWrapper_writeToMe LOCAL_ENGINE_JNI_METHOD_END(env, ) } -JNIEXPORT jstring Java_org_apache_spark_sql_execution_datasources_CHDatasourceJniWrapper_closeMergeTreeWriter(JNIEnv * env, jobject, jlong instanceId) +JNIEXPORT jstring +Java_org_apache_spark_sql_execution_datasources_CHDatasourceJniWrapper_closeMergeTreeWriter(JNIEnv * env, jobject, jlong instanceId) { LOCAL_ENGINE_JNI_METHOD_START auto * writer = reinterpret_cast(instanceId); @@ -1146,12 +1000,19 @@ JNIEXPORT jstring Java_org_apache_spark_sql_execution_datasources_CHDatasourceJn auto 
part_infos = writer->getAllPartInfo(); auto json_info = local_engine::SparkMergeTreeWriter::partInfosToJson(part_infos); delete writer; - return stringTojstring(env, json_info.c_str()); + return local_engine::charTojstring(env, json_info.c_str()); LOCAL_ENGINE_JNI_METHOD_END(env, nullptr) } JNIEXPORT jstring Java_org_apache_spark_sql_execution_datasources_CHDatasourceJniWrapper_nativeMergeMTParts( - JNIEnv * env, jobject, jbyteArray plan_, jbyteArray split_info_, jstring uuid_, jstring task_id_, jstring partition_dir_, jstring bucket_dir_) + JNIEnv * env, + jobject, + jbyteArray plan_, + jbyteArray split_info_, + jstring uuid_, + jstring task_id_, + jstring partition_dir_, + jstring bucket_dir_) { LOCAL_ENGINE_JNI_METHOD_START @@ -1164,53 +1025,45 @@ JNIEXPORT jstring Java_org_apache_spark_sql_execution_datasources_CHDatasourceJn const auto partition_dir = jstring2string(env, partition_dir_); const auto bucket_dir = jstring2string(env, bucket_dir_); - jsize plan_buf_size = env->GetArrayLength(plan_); - jbyte * plan_buf_addr = env->GetByteArrayElements(plan_, nullptr); - std::string plan_str; - plan_str.assign(reinterpret_cast(plan_buf_addr), plan_buf_size); - - jsize split_info_size = env->GetArrayLength(split_info_); - jbyte * split_info_addr = env->GetByteArrayElements(split_info_, nullptr); - std::string split_info_str; - split_info_str.assign(reinterpret_cast(split_info_addr), split_info_size); + const auto plan_a = local_engine::getByteArrayElementsSafe(env, plan_); - auto plan_ptr = std::make_unique(); /// https://stackoverflow.com/questions/52028583/getting-error-parsing-protobuf-data /// Parsing may fail when the number of recursive layers is large. /// Here, set a limit large enough to avoid this problem. /// Once this problem occurs, it is difficult to troubleshoot, because the pb of c++ will not provide any valid information - google::protobuf::io::CodedInputStream coded_in( - reinterpret_cast(plan_str.data()), static_cast(plan_str.size())); + google::protobuf::io::CodedInputStream coded_in(plan_a.elems(), plan_a.length()); coded_in.SetRecursionLimit(100000); - auto ok = plan_ptr->ParseFromCodedStream(&coded_in); - if (!ok) + substrait::Plan plan_ptr; + if (!plan_ptr.ParseFromCodedStream(&coded_in)) throw DB::Exception(DB::ErrorCodes::CANNOT_PARSE_PROTOBUF_SCHEMA, "Parse substrait::Plan from string failed"); - substrait::ReadRel::ExtensionTable extension_table = - local_engine::SerializedPlanParser::parseExtensionTable(split_info_str); - - UUID uuid = UUIDHelpers::generateV4(); // each task using its own CustomStorageMergeTree, don't reuse - auto storage = local_engine::MergeTreeRelParser::parseStorage( - extension_table, local_engine::SerializedPlanParser::global_context, uuid); + const auto split_info_a = local_engine::getByteArrayElementsSafe(env, split_info_); + const std::string::size_type split_info_size = split_info_a.length(); + std::string split_info_str{reinterpret_cast(split_info_a.elems()), split_info_size}; + substrait::ReadRel::ExtensionTable extension_table = local_engine::SerializedPlanParser::parseExtensionTable(split_info_str); google::protobuf::StringValue table; table.ParseFromString(extension_table.detail().value()); auto merge_tree_table = local_engine::parseMergeTreeTableString(table.value()); - DB::StorageID table_id(merge_tree_table.database, merge_tree_table.table, uuid); - local_engine::TempStorageFreer freer {table_id}; // to release temp CustomStorageMergeTree with RAII - auto storage_factory = local_engine::StorageMergeTreeFactory::instance(); - 
std::vector selected_parts = storage_factory.getDataParts(table_id, merge_tree_table.snapshot_id, merge_tree_table.getPartNames()); + + // each task using its own CustomStorageMergeTree, don't reuse + auto temp_storage + = local_engine::MergeTreeRelParser::copyToVirtualStorage(merge_tree_table, local_engine::SerializedPlanParser::global_context); + + local_engine::TempStorageFreer freer{temp_storage->getStorageID()}; // to release temp CustomStorageMergeTree with RAII + std::vector selected_parts = local_engine::StorageMergeTreeFactory::instance().getDataPartsByNames( + temp_storage->getStorageID(), "", merge_tree_table.getPartNames()); std::unordered_map partition_values; - std::vector loaded = - local_engine::mergeParts(selected_parts, partition_values, uuid_str, storage, partition_dir, bucket_dir); + std::vector loaded + = local_engine::mergeParts(selected_parts, partition_values, uuid_str, temp_storage, partition_dir, bucket_dir); std::vector res; for (auto & partPtr : loaded) { saveFileStatus( - *storage, + *temp_storage, local_engine::SerializedPlanParser::global_context, partPtr->name, const_cast(partPtr->getDataPartStorage())); @@ -1220,10 +1073,7 @@ JNIEXPORT jstring Java_org_apache_spark_sql_execution_datasources_CHDatasourceJn auto json_info = local_engine::SparkMergeTreeWriter::partInfosToJson(res); - env->ReleaseByteArrayElements(plan_, plan_buf_addr, JNI_ABORT); - env->ReleaseByteArrayElements(split_info_, split_info_addr, JNI_ABORT); - - return stringTojstring(env, json_info.c_str()); + return local_engine::charTojstring(env, json_info.c_str()); LOCAL_ENGINE_JNI_METHOD_END(env, nullptr) } @@ -1233,16 +1083,14 @@ JNIEXPORT jobject Java_org_apache_spark_sql_execution_datasources_CHDatasourceJn { LOCAL_ENGINE_JNI_METHOD_START auto * block = reinterpret_cast(blockAddress); - int * pIndice = env->GetIntArrayElements(partitionColIndice, nullptr); - int size = env->GetArrayLength(partitionColIndice); + auto safeArray = local_engine::getIntArrayElementsSafe(env, partitionColIndice); std::vector partition_col_indice_vec; - for (int i = 0; i < size; ++i) - partition_col_indice_vec.push_back(pIndice[i]); - - env->ReleaseIntArrayElements(partitionColIndice, pIndice, JNI_ABORT); - local_engine::BlockStripes bs = local_engine::BlockStripeSplitter::split(*block, partition_col_indice_vec, hasBucket, reserve_partition_columns); + for (int i = 0; i < safeArray.length(); ++i) + partition_col_indice_vec.push_back(safeArray.elems()[i]); + local_engine::BlockStripes bs + = local_engine::BlockStripeSplitter::split(*block, partition_col_indice_vec, hasBucket, reserve_partition_columns); auto * addresses = env->NewLongArray(bs.block_addresses.size()); env->SetLongArrayRegion(addresses, 0, bs.block_addresses.size(), bs.block_addresses.data()); @@ -1257,23 +1105,29 @@ JNIEXPORT jobject Java_org_apache_spark_sql_execution_datasources_CHDatasourceJn } JNIEXPORT jlong Java_org_apache_gluten_vectorized_StorageJoinBuilder_nativeBuild( - JNIEnv * env, jclass, jstring key, jbyteArray in, jlong row_count_, jstring join_key_, jint join_type_, jbyteArray named_struct) + JNIEnv * env, + jclass, + jstring key, + jbyteArray in, + jlong row_count_, + jstring join_key_, + jint join_type_, + jboolean has_mixed_join_condition, + jbyteArray named_struct) { LOCAL_ENGINE_JNI_METHOD_START const auto hash_table_id = jstring2string(env, key); const auto join_key = jstring2string(env, join_key_); - const jsize struct_size = env->GetArrayLength(named_struct); - jbyte * struct_address = env->GetByteArrayElements(named_struct, 
nullptr); - std::string struct_string; - struct_string.assign(reinterpret_cast(struct_address), struct_size); + const auto named_struct_a = local_engine::getByteArrayElementsSafe(env, named_struct); + const std::string::size_type struct_size = named_struct_a.length(); + std::string struct_string{reinterpret_cast(named_struct_a.elems()), struct_size}; const auto join_type = static_cast(join_type_); const jsize length = env->GetArrayLength(in); local_engine::ReadBufferFromByteArray read_buffer_from_java_array(in, length); DB::CompressedReadBuffer input(read_buffer_from_java_array); local_engine::configureCompressedReadBuffer(input); - const auto * obj - = make_wrapper(local_engine::BroadCastJoinBuilder::buildJoin(hash_table_id, input, row_count_, join_key, join_type, struct_string)); - env->ReleaseByteArrayElements(named_struct, struct_address, JNI_ABORT); + const auto * obj = make_wrapper(local_engine::BroadCastJoinBuilder::buildJoin( + hash_table_id, input, row_count_, join_key, join_type, has_mixed_join_condition, struct_string)); return obj->instance(); LOCAL_ENGINE_JNI_METHOD_END(env, 0) } @@ -1402,14 +1256,10 @@ Java_org_apache_gluten_vectorized_SimpleExpressionEval_createNativeInstance(JNIE local_engine::SerializedPlanParser parser(context); jobject iter = env->NewGlobalRef(input); parser.addInputIter(iter, false); - jsize plan_size = env->GetArrayLength(plan); - jbyte * plan_address = env->GetByteArrayElements(plan, nullptr); - std::string plan_string; - plan_string.assign(reinterpret_cast(plan_address), plan_size); - auto query_plan = parser.parse(plan_string); - local_engine::LocalExecutor * executor = new local_engine::LocalExecutor(context); - executor->execute(std::move(query_plan)); - env->ReleaseByteArrayElements(plan, plan_address, JNI_ABORT); + const auto plan_a = local_engine::getByteArrayElementsSafe(env, plan); + const std::string::size_type plan_size = plan_a.length(); + local_engine::LocalExecutor * executor + = parser.createExecutor({reinterpret_cast(plan_a.elems()), plan_size}).release(); return reinterpret_cast(executor); LOCAL_ENGINE_JNI_METHOD_END(env, -1) } @@ -1443,7 +1293,8 @@ JNIEXPORT jlong Java_org_apache_gluten_memory_alloc_CHNativeMemoryAllocator_getD return -1; } -JNIEXPORT jlong Java_org_apache_gluten_memory_alloc_CHNativeMemoryAllocator_createListenableAllocator(JNIEnv * env, jclass, jobject listener) +JNIEXPORT jlong +Java_org_apache_gluten_memory_alloc_CHNativeMemoryAllocator_createListenableAllocator(JNIEnv * env, jclass, jobject listener) { LOCAL_ENGINE_JNI_METHOD_START auto listener_wrapper = std::make_shared(env->NewGlobalRef(listener)); diff --git a/cpp-ch/local-engine/proto/CMakeLists.txt b/cpp-ch/local-engine/proto/CMakeLists.txt index 31583ff659ed1..ffb34504af526 100644 --- a/cpp-ch/local-engine/proto/CMakeLists.txt +++ b/cpp-ch/local-engine/proto/CMakeLists.txt @@ -12,37 +12,34 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-file(GLOB protobuf_files - substrait/*.proto - substrait/extensions/*.proto - ) +file(GLOB protobuf_files substrait/*.proto substrait/extensions/*.proto) -FOREACH(FIL ${protobuf_files}) - file(RELATIVE_PATH FIL_RELATIVE ${ClickHouse_SOURCE_DIR}/utils/extern-local-engine/proto/ ${FIL}) - string(REGEX REPLACE "\\.proto" "" FILE_NAME ${FIL_RELATIVE}) - LIST(APPEND SUBSTRAIT_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${FILE_NAME}.pb.cc") - LIST(APPEND SUBSTRAIT_HEADERS "${CMAKE_CURRENT_BINARY_DIR}/${FILE_NAME}.pb.h") -ENDFOREACH() +foreach(FIL ${protobuf_files}) + file(RELATIVE_PATH FIL_RELATIVE + ${ClickHouse_SOURCE_DIR}/utils/extern-local-engine/proto/ ${FIL}) + string(REGEX REPLACE "\\.proto" "" FILE_NAME ${FIL_RELATIVE}) + list(APPEND SUBSTRAIT_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${FILE_NAME}.pb.cc") + list(APPEND SUBSTRAIT_HEADERS "${CMAKE_CURRENT_BINARY_DIR}/${FILE_NAME}.pb.h") +endforeach() # Generate Substrait headers add_custom_command( - OUTPUT ${SUBSTRAIT_SRCS} ${SUBSTRAIT_HEADERS} - COMMAND - $ --cpp_out ${CMAKE_CURRENT_BINARY_DIR} - --proto_path ${CMAKE_CURRENT_SOURCE_DIR} - --proto_path ${ClickHouse_SOURCE_DIR}/contrib/google-protobuf/src - ${protobuf_files} - DEPENDS ${protobuf_files} - COMMENT "Running cpp protocol buffer compiler" - VERBATIM) -add_custom_target(generate_substrait ALL DEPENDS ${SUBSTRAIT_SRCS} ${SUBSTRAIT_HEADERS}) + OUTPUT ${SUBSTRAIT_SRCS} ${SUBSTRAIT_HEADERS} + COMMAND + $ --cpp_out ${CMAKE_CURRENT_BINARY_DIR} --proto_path + ${CMAKE_CURRENT_SOURCE_DIR} --proto_path + ${ClickHouse_SOURCE_DIR}/contrib/google-protobuf/src ${protobuf_files} + DEPENDS ${protobuf_files} + COMMENT "Running cpp protocol buffer compiler" + VERBATIM) +add_custom_target(generate_substrait ALL DEPENDS ${SUBSTRAIT_SRCS} + ${SUBSTRAIT_HEADERS}) set_source_files_properties(${SUBSTRAIT_SRCS} PROPERTIES GENERATED TRUE) add_library(substrait ${SUBSTRAIT_SRCS}) add_dependencies(substrait generate_substrait) -target_compile_options(substrait PUBLIC -fPIC - -Wno-reserved-identifier - -Wno-deprecated) -target_include_directories(substrait SYSTEM BEFORE PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) +target_compile_options(substrait PUBLIC -fPIC -Wno-reserved-identifier + -Wno-deprecated) +target_include_directories(substrait SYSTEM BEFORE + PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(substrait ch_contrib::protobuf) - diff --git a/cpp-ch/local-engine/tests/CMakeLists.txt b/cpp-ch/local-engine/tests/CMakeLists.txt index 9781a332e89cf..be02bf6234d28 100644 --- a/cpp-ch/local-engine/tests/CMakeLists.txt +++ b/cpp-ch/local-engine/tests/CMakeLists.txt @@ -12,76 +12,92 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/testConfig.h.in ${CMAKE_CURRENT_SOURCE_DIR}/testConfig.h) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/testConfig.h.in + ${CMAKE_CURRENT_SOURCE_DIR}/testConfig.h) -if (ENABLE_TESTS) - macro(add_gtest_sources prefix common_path) - add_glob(${prefix}_sources ${common_path}/gtest*.cpp) - endmacro() +if(ENABLE_TESTS) + macro(add_gtest_sources prefix common_path) + add_glob(${prefix}_sources ${common_path}/gtest*.cpp) + endmacro() - set(USE_INTERNAL_GTEST_LIBRARY 0) - set(BENCHMARK_ENABLE_TESTING OFF) + set(USE_INTERNAL_GTEST_LIBRARY 0) + set(BENCHMARK_ENABLE_TESTING OFF) - enable_testing() - include(CTest) + enable_testing() + include(CTest) - include_directories(${GTEST_INCLUDE_DIRS}) + include_directories(${GTEST_INCLUDE_DIRS}) - set(TEST_DATA_DIR "${ClickHouse_SOURCE_DIR}/utils/extern-local-engine/tests") - set(HAVE_POSIX_REGEX 1) - set(LOCAL_ENGINE_DIR "${ClickHouse_SOURCE_DIR}/utils/extern-local-engine") + set(TEST_DATA_DIR "${ClickHouse_SOURCE_DIR}/utils/extern-local-engine/tests") + set(HAVE_POSIX_REGEX 1) + set(LOCAL_ENGINE_DIR "${ClickHouse_SOURCE_DIR}/utils/extern-local-engine") - add_gtest_sources(local_engine_gtest .) - if (ENABLE_LOCAL_UDFS) - file(GLOB children CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ../Parser/*_udf/tests) - foreach(child ${children}) - add_gtest_sources(local_engine_gtest ${child}) - endforeach() - - file(GLOB children CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ../Parser/*_udf) - foreach(child ${children}) - add_headers_and_sources(local_engine_udf ${child}) - endforeach() - endif () + add_gtest_sources(local_engine_gtest .) + if(ENABLE_LOCAL_UDFS) + file( + GLOB children CONFIGURE_DEPENDS + RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + ../Parser/*_udf/tests) + foreach(child ${children}) + add_gtest_sources(local_engine_gtest ${child}) + endforeach() - file(GLOB children CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ../Parser/*_function_parser) + file( + GLOB children CONFIGURE_DEPENDS + RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + ../Parser/*_udf) foreach(child ${children}) - add_headers_and_sources(local_engine_function_parser ${child}) + add_headers_and_sources(local_engine_udf ${child}) endforeach() + endif() + file( + GLOB children CONFIGURE_DEPENDS + RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + ../Parser/*_function_parser) + foreach(child ${children}) + add_headers_and_sources(local_engine_function_parser ${child}) + endforeach() - message("local engine gtest sources: ${local_engine_gtest_sources}") - message("local engine udf sources: ${local_engine_udf_sources}") - message("local engine function parser sources: ${local_engine_function_parser_sources}") + message("local engine gtest sources: ${local_engine_gtest_sources}") + message("local engine udf sources: ${local_engine_udf_sources}") + message( + "local engine function parser sources: ${local_engine_function_parser_sources}" + ) - add_executable(unit_tests_local_engine - gluten_test_util.cpp - ${local_engine_gtest_sources} - ${local_engine_udf_sources} - ${local_engine_function_parser_sources}) - target_include_directories(unit_tests_local_engine PRIVATE - ${ClickHouse_SOURCE_DIR}/utils/extern-local_engine - ${CMAKE_CURRENT_SOURCE_DIR} - ) - # no-unreachable-code for GTEST_SKIP - target_compile_options(unit_tests_local_engine PRIVATE -Wno-unreachable-code) - target_link_libraries(unit_tests_local_engine PRIVATE gluten_clickhouse_backend_libs clickhouse_parsers loggers ch_contrib::gmock_all ch_contrib::gtest) - 
target_link_libraries(unit_tests_local_engine PRIVATE ch_parquet) + add_executable( + unit_tests_local_engine + gluten_test_util.cpp ${local_engine_gtest_sources} + ${local_engine_udf_sources} ${local_engine_function_parser_sources}) + target_include_directories( + unit_tests_local_engine + PRIVATE ${ClickHouse_SOURCE_DIR}/utils/extern-local_engine + ${CMAKE_CURRENT_SOURCE_DIR}) + # no-unreachable-code for GTEST_SKIP + target_compile_options(unit_tests_local_engine PRIVATE -Wno-unreachable-code) + target_link_libraries( + unit_tests_local_engine + PRIVATE gluten_clickhouse_backend_libs clickhouse_parsers loggers + ch_contrib::gmock_all ch_contrib::gtest) + target_link_libraries(unit_tests_local_engine PRIVATE ch_parquet) endif() -if (ENABLE_BENCHMARKS) - include_directories(benchmark_local_engine SYSTEM PUBLIC - ${ClickHouse_SOURCE_DIR}/utils/extern-local_engine - ) - add_executable(benchmark_local_engine - gluten_test_util.cpp - benchmark_local_engine.cpp - benchmark_parquet_read.cpp - benchmark_spark_row.cpp - benchmark_unix_timestamp_function.cpp - benchmark_spark_floor_function.cpp - benchmark_cast_float_function.cpp - benchmark_to_datetime_function.cpp - benchmark_spark_divide_function.cpp) - target_link_libraries(benchmark_local_engine PRIVATE gluten_clickhouse_backend_libs ch_contrib::gbenchmark_all loggers ch_parquet) +if(ENABLE_BENCHMARKS) + include_directories(benchmark_local_engine SYSTEM PUBLIC + ${ClickHouse_SOURCE_DIR}/utils/extern-local_engine) + add_executable( + benchmark_local_engine + gluten_test_util.cpp + benchmark_local_engine.cpp + benchmark_parquet_read.cpp + benchmark_spark_row.cpp + benchmark_unix_timestamp_function.cpp + benchmark_spark_floor_function.cpp + benchmark_cast_float_function.cpp + benchmark_to_datetime_function.cpp + benchmark_spark_divide_function.cpp) + target_link_libraries( + benchmark_local_engine + PRIVATE gluten_clickhouse_backend_libs ch_contrib::gbenchmark_all loggers + ch_parquet) endif() diff --git a/cpp-ch/local-engine/tests/benchmark_local_engine.cpp b/cpp-ch/local-engine/tests/benchmark_local_engine.cpp index 89fa4fa961eaa..43cdab8a41fa4 100644 --- a/cpp-ch/local-engine/tests/benchmark_local_engine.cpp +++ b/cpp-ch/local-engine/tests/benchmark_local_engine.cpp @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -154,14 +154,11 @@ DB::ContextMutablePtr global_context; std::move(schema)) .build(); local_engine::SerializedPlanParser parser(global_context); - auto query_plan = parser.parse(std::move(plan)); - local_engine::LocalExecutor local_executor; + auto local_executor = parser.createExecutor(*plan); state.ResumeTiming(); - local_executor.execute(std::move(query_plan)); - while (local_executor.hasNext()) - { - local_engine::SparkRowInfoPtr spark_row_info = local_executor.next(); - } + + while (local_executor->hasNext()) + local_engine::SparkRowInfoPtr spark_row_info = local_executor->next(); } } @@ -212,13 +209,12 @@ DB::ContextMutablePtr global_context; std::move(schema)) .build(); local_engine::SerializedPlanParser parser(SerializedPlanParser::global_context); - auto query_plan = parser.parse(std::move(plan)); - local_engine::LocalExecutor local_executor; + auto local_executor = parser.createExecutor(*plan); state.ResumeTiming(); - local_executor.execute(std::move(query_plan)); - while (local_executor.hasNext()) + + while (local_executor->hasNext()) { - Block * block = local_executor.nextColumnar(); + Block * block = local_executor->nextColumnar(); delete block; } } @@ -238,15 +234,10 @@ 
DB::ContextMutablePtr global_context; std::ifstream t(path); std::string str((std::istreambuf_iterator(t)), std::istreambuf_iterator()); std::cout << "the plan from: " << path << std::endl; - - auto query_plan = parser.parse(str); - local_engine::LocalExecutor local_executor; + auto local_executor = parser.createExecutor(str); state.ResumeTiming(); - local_executor.execute(std::move(query_plan)); - while (local_executor.hasNext()) - { - [[maybe_unused]] auto * x = local_executor.nextColumnar(); - } + while (local_executor->hasNext()) [[maybe_unused]] + auto * x = local_executor->nextColumnar(); } } @@ -282,14 +273,12 @@ DB::ContextMutablePtr global_context; std::move(schema)) .build(); local_engine::SerializedPlanParser parser(SerializedPlanParser::global_context); - auto query_plan = parser.parse(std::move(plan)); - local_engine::LocalExecutor local_executor; + + auto local_executor = parser.createExecutor(*plan); state.ResumeTiming(); - local_executor.execute(std::move(query_plan)); - while (local_executor.hasNext()) - { - local_engine::SparkRowInfoPtr spark_row_info = local_executor.next(); - } + + while (local_executor->hasNext()) + local_engine::SparkRowInfoPtr spark_row_info = local_executor->next(); } } @@ -320,16 +309,13 @@ DB::ContextMutablePtr global_context; .build(); local_engine::SerializedPlanParser parser(SerializedPlanParser::global_context); - auto query_plan = parser.parse(std::move(plan)); - local_engine::LocalExecutor local_executor; - - local_executor.execute(std::move(query_plan)); + auto local_executor = parser.createExecutor(*plan); local_engine::SparkRowToCHColumn converter; - while (local_executor.hasNext()) + while (local_executor->hasNext()) { - local_engine::SparkRowInfoPtr spark_row_info = local_executor.next(); + local_engine::SparkRowInfoPtr spark_row_info = local_executor->next(); state.ResumeTiming(); - auto block = converter.convertSparkRowInfoToCHColumn(*spark_row_info, local_executor.getHeader()); + auto block = converter.convertSparkRowInfoToCHColumn(*spark_row_info, local_executor->getHeader()); state.PauseTiming(); } state.ResumeTiming(); @@ -368,16 +354,13 @@ DB::ContextMutablePtr global_context; std::move(schema)) .build(); local_engine::SerializedPlanParser parser(SerializedPlanParser::global_context); - auto query_plan = parser.parse(std::move(plan)); - local_engine::LocalExecutor local_executor; - - local_executor.execute(std::move(query_plan)); + auto local_executor = parser.createExecutor(*plan); local_engine::SparkRowToCHColumn converter; - while (local_executor.hasNext()) + while (local_executor->hasNext()) { - local_engine::SparkRowInfoPtr spark_row_info = local_executor.next(); + local_engine::SparkRowInfoPtr spark_row_info = local_executor->next(); state.ResumeTiming(); - auto block = converter.convertSparkRowInfoToCHColumn(*spark_row_info, local_executor.getHeader()); + auto block = converter.convertSparkRowInfoToCHColumn(*spark_row_info, local_executor->getHeader()); state.PauseTiming(); } state.ResumeTiming(); @@ -485,12 +468,8 @@ DB::ContextMutablePtr global_context; y.reserve(cnt); for (auto _ : state) - { for (i = 0; i < cnt; i++) - { y[i] = add(x[i], i); - } - } } [[maybe_unused]] static void BM_TestSumInline(benchmark::State & state) @@ -504,12 +483,8 @@ DB::ContextMutablePtr global_context; y.reserve(cnt); for (auto _ : state) - { for (i = 0; i < cnt; i++) - { y[i] = x[i] + i; - } - } } [[maybe_unused]] static void BM_TestPlus(benchmark::State & state) @@ -545,9 +520,7 @@ DB::ContextMutablePtr global_context; block.insert(y); 
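The benchmark edits in this file all make the same mechanical change: the old parse()/LocalExecutor::execute() pair is replaced by SerializedPlanParser::createExecutor(), and results are drained through hasNext()/next() or nextColumnar(). Consolidated, the new usage pattern looks roughly like the sketch below; the include path, the plan type, and the exact createExecutor signature are assumptions inferred from the call sites in this patch:

#include <Parser/SerializedPlanParser.h>  // assumed header name, not taken from this patch

// Sketch of the post-refactor executor API: createExecutor() subsumes parse() + execute().
static void runPlanOnce(substrait::Plan & plan)  // assumed: *plan at the call sites is a substrait::Plan
{
    local_engine::SerializedPlanParser parser(local_engine::SerializedPlanParser::global_context);
    auto local_executor = parser.createExecutor(plan);
    while (local_executor->hasNext())
    {
        local_engine::SparkRowInfoPtr spark_row_info = local_executor->next(); // row-oriented output
        (void)spark_row_info;                                                  // nextColumnar() yields DB::Block* instead
    }
}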
auto executable_function = function->prepare(arguments); for (auto _ : state) - { auto result = executable_function->execute(block.getColumnsWithTypeAndName(), type, rows, false); - } } [[maybe_unused]] static void BM_TestPlusEmbedded(benchmark::State & state) @@ -847,9 +820,7 @@ QueryPlanPtr joinPlan(QueryPlanPtr left, QueryPlanPtr right, String left_key, St ASTPtr rkey = std::make_shared(right_key); join->addOnKeys(lkey, rkey, true); for (const auto & column : join->columnsFromJoinedTable()) - { join->addJoinedColumn(column); - } auto left_keys = left->getCurrentDataStream().header.getNamesAndTypesList(); join->addJoinedColumnsAndCorrectTypes(left_keys, true); @@ -920,7 +891,8 @@ BENCHMARK(BM_ParquetRead)->Unit(benchmark::kMillisecond)->Iterations(10); int main(int argc, char ** argv) { - BackendInitializerUtil::init(nullptr); + std::string empty; + BackendInitializerUtil::init(empty); SCOPE_EXIT({ BackendFinalizerUtil::finalizeGlobally(); }); ::benchmark::Initialize(&argc, argv); diff --git a/cpp-ch/local-engine/tests/gluten_test_util.cpp b/cpp-ch/local-engine/tests/gluten_test_util.cpp index 7fdd32d1661b5..0448092b960dd 100644 --- a/cpp-ch/local-engine/tests/gluten_test_util.cpp +++ b/cpp-ch/local-engine/tests/gluten_test_util.cpp @@ -62,14 +62,14 @@ ActionsDAGPtr parseFilter(const std::string & filter, const AnotherRowType & nam size_limits_for_set, static_cast(0), name_and_types, - std::make_shared(name_and_types), + ActionsDAG(name_and_types), prepared_sets /* prepared_sets */, false /* no_subqueries */, false /* no_makeset */, false /* only_consts */, info); ActionsVisitor(visitor_data).visit(ast_exp); - return ActionsDAG::buildFilterActionsDAG({visitor_data.getActions()->getOutputs().back()}, node_name_to_input_column); + return ActionsDAG::buildFilterActionsDAG({visitor_data.getActions().getOutputs().back()}, node_name_to_input_column); } const char * get_data_dir() diff --git a/cpp-ch/local-engine/tests/gluten_test_util.h b/cpp-ch/local-engine/tests/gluten_test_util.h index d4c16e9fbbd84..dba4496d6221b 100644 --- a/cpp-ch/local-engine/tests/gluten_test_util.h +++ b/cpp-ch/local-engine/tests/gluten_test_util.h @@ -24,6 +24,7 @@ #include #include #include +#include #include using BlockRowType = DB::ColumnsWithTypeAndName; @@ -60,6 +61,23 @@ AnotherRowType readParquetSchema(const std::string & file); DB::ActionsDAGPtr parseFilter(const std::string & filter, const AnotherRowType & name_and_types); +namespace pb_util +{ +template +std::string JsonStringToBinary(const std::string_view & json) +{ + Message message; + std::string binary; + auto s = google::protobuf::util::JsonStringToMessage(json, &message); + if (!s.ok()) + { + const std::string err_msg{s.message()}; + throw std::runtime_error(err_msg); + } + message.SerializeToString(&binary); + return binary; +} +} } inline DB::DataTypePtr BIGINT() diff --git a/cpp-ch/local-engine/tests/gtest_ch_functions.cpp b/cpp-ch/local-engine/tests/gtest_ch_functions.cpp index 943826287d540..613beb9b8051d 100644 --- a/cpp-ch/local-engine/tests/gtest_ch_functions.cpp +++ b/cpp-ch/local-engine/tests/gtest_ch_functions.cpp @@ -79,10 +79,12 @@ TEST(TestFunction, In) set->insertFromBlock(col1_set_block.getColumnsWithTypeAndName()); set->finishInsert(); auto future_set = std::make_shared(std::move(set)); - auto arg = ColumnSet::create(1, future_set); + //TODO: WHY? 
after https://github.com/ClickHouse/ClickHouse/pull/63723 we need pass 4 instead of 1 + auto arg = ColumnSet::create(4, future_set); ColumnsWithTypeAndName columns - = {ColumnWithTypeAndName(std::move(column1), type0, "string0"), ColumnWithTypeAndName(std::move(arg), type_set, "__set")}; + = {ColumnWithTypeAndName(std::move(column1), type0, "string0"), + ColumnWithTypeAndName(std::move(arg), type_set, "__set")}; Block block(columns); std::cerr << "input:\n"; debug::headBlock(block); @@ -121,7 +123,9 @@ TEST(TestFunction, NotIn1) set->insertFromBlock(col1_set_block.getColumnsWithTypeAndName()); set->finishInsert(); auto future_set = std::make_shared(std::move(set)); - auto arg = ColumnSet::create(1,future_set); + + //TODO: WHY? after https://github.com/ClickHouse/ClickHouse/pull/63723 we need pass 4 instead of 1 + auto arg = ColumnSet::create(4,future_set); ColumnsWithTypeAndName columns = {ColumnWithTypeAndName(std::move(column1), type0, "string0"), ColumnWithTypeAndName(std::move(arg), type_set, "__set")}; @@ -162,7 +166,9 @@ TEST(TestFunction, NotIn2) set->insertFromBlock(col1_set_block.getColumnsWithTypeAndName()); set->finishInsert(); auto future_set = std::make_shared(std::move(set)); - auto arg = ColumnSet::create(1,future_set); + + //TODO: WHY? after https://github.com/ClickHouse/ClickHouse/pull/63723 we need pass 4 instead of 1 + auto arg = ColumnSet::create(4,future_set); ColumnsWithTypeAndName columns = {ColumnWithTypeAndName(std::move(column1), type0, "string0"), ColumnWithTypeAndName(std::move(arg), type_set, "__set")}; diff --git a/cpp-ch/local-engine/tests/gtest_ch_join.cpp b/cpp-ch/local-engine/tests/gtest_ch_join.cpp index 739390302b46a..43bac7a597286 100644 --- a/cpp-ch/local-engine/tests/gtest_ch_join.cpp +++ b/cpp-ch/local-engine/tests/gtest_ch_join.cpp @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include diff --git a/cpp-ch/local-engine/tests/gtest_local_engine.cpp b/cpp-ch/local-engine/tests/gtest_local_engine.cpp index 2d1807841041a..962bf9def52e3 100644 --- a/cpp-ch/local-engine/tests/gtest_local_engine.cpp +++ b/cpp-ch/local-engine/tests/gtest_local_engine.cpp @@ -16,9 +16,12 @@ */ #include #include +#include +#include + #include -#include #include +#include #include #include #include @@ -28,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -84,13 +86,23 @@ TEST(ReadBufferFromFile, seekBackwards) ASSERT_EQ(x, 8); } +INCBIN(resource_embedded_config_json, SOURCE_DIR "/utils/extern-local-engine/tests/json/gtest_local_engine_config.json"); + +namespace DB +{ +void registerOutputFormatParquet(DB::FormatFactory & factory); +} + int main(int argc, char ** argv) { - auto * init = new 
String("{\"advancedExtensions\":{\"enhancement\":{\"@type\":\"type.googleapis.com/substrait.Expression\",\"literal\":{\"map\":{\"keyValues\":[{\"key\":{\"string\":\"spark.gluten.sql.columnar.backend.ch.runtime_config.logger.level\"},\"value\":{\"string\":\"trace\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.backend.ch.runtime_settings.max_bytes_before_external_sort\"},\"value\":{\"string\":\"5368709120\"}},{\"key\":{\"string\":\"spark.hadoop.fs.s3a.endpoint\"},\"value\":{\"string\":\"localhost:9000\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.backend.velox.IOThreads\"},\"value\":{\"string\":\"0\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.backend.ch.runtime_config.hdfs.input_read_timeout\"},\"value\":{\"string\":\"180000\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.backend.ch.runtime_settings.query_plan_enable_optimizations\"},\"value\":{\"string\":\"false\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.backend.ch.worker.id\"},\"value\":{\"string\":\"1\"}},{\"key\":{\"string\":\"spark.memory.offHeap.enabled\"},\"value\":{\"string\":\"true\"}},{\"key\":{\"string\":\"spark.hadoop.fs.s3a.iam.role.session.name\"},\"value\":{\"string\":\"\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.backend.ch.runtime_config.hdfs.input_connect_timeout\"},\"value\":{\"string\":\"180000\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.shuffle.codec\"},\"value\":{\"string\":\"\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.backend.ch.runtime_config.local_engine.settings.log_processors_profiles\"},\"value\":{\"string\":\"true\"}},{\"key\":{\"string\":\"spark.gluten.memory.offHeap.size.in.bytes\"},\"value\":{\"string\":\"10737418240\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.shuffle.codecBackend\"},\"value\":{\"string\":\"\"}},{\"key\":{\"string\":\"spark.sql.orc.compression.codec\"},\"value\":{\"string\":\"snappy\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.backend.ch.runtime_settings.max_bytes_before_external_group_by\"},\"value\":{\"string\":\"5368709120\"}},{\"key\":{\"string\":\"spark.hadoop.input.write.timeout\"},\"value\":{\"string\":\"180000\"}},{\"key\":{\"string\":\"spark.hadoop.fs.s3a.secret.key\"},\"value\":{\"string\":\"\"}},{\"key\":{\"string\":\"spark.hadoop.fs.s3a.access.key\"},\"value\":{\"string\":\"\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.backend.ch.runtime_config.hdfs.dfs_client_log_severity\"},\"value\":{\"string\":\"INFO\"}},{\"key\":{\"string\":\"spark.hadoop.fs.s3a.path.style.access\"},\"value\":{\"string\":\"true\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.backend.ch.runtime_config.timezone\"},\"value\":{\"string\":\"Asia/Shanghai\"}},{\"key\":{\"string\":\"spark.hadoop.input.read.timeout\"},\"value\":{\"string\":\"180000\"}},{\"key\":{\"string\":\"spark.hadoop.fs.s3a.use.instance.credentials\"},\"value\":{\"string\":\"false\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.backend.ch.runtime_settings.output_format_orc_compression_method\"},\"value\":{\"string\":\"snappy\"}},{\"key\":{\"string\":\"spark.hadoop.fs.s3a.iam.role\"},\"value\":{\"string\":\"\"}},{\"key\":{\"string\":\"spark.gluten.memory.task.offHeap.size.in.bytes\"},\"value\":{\"string\":\"10737418240\"}},{\"key\":{\"string\":\"spark.hadoop.input.connect.timeout\"},\"value\":{\"string\":\"180000\"}},{\"key\":{\"string\":\"spark.hadoop.dfs.client.log.severity\"},\"value\":{\"string\":\"INFO\"}},{\"key\":{\"string\":\"spark.gluten.sql.columnar.backend.velox.SplitPreloadPerDriver\"},\"value\":{\"string\":\"2\"}},{\"key\":{\"string\":\"sp
ark.gluten.sql.columnar.backend.ch.runtime_config.hdfs.input_write_timeout\"},\"value\":{\"string\":\"180000\"}},{\"key\":{\"string\":\"spark.hadoop.fs.s3a.connection.ssl.enabled\"},\"value\":{\"string\":\"false\"}}]}}}}}"); + BackendInitializerUtil::init(test::pb_util::JsonStringToBinary( + {reinterpret_cast(gresource_embedded_config_jsonData), gresource_embedded_config_jsonSize})); + + auto & factory = FormatFactory::instance(); + DB::registerOutputFormatParquet(factory); - BackendInitializerUtil::init_json(std::move(init)); SCOPE_EXIT({ BackendFinalizerUtil::finalizeGlobally(); }); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -} +} \ No newline at end of file diff --git a/cpp-ch/local-engine/tests/gtest_parquet_columnindex.cpp b/cpp-ch/local-engine/tests/gtest_parquet_columnindex.cpp index ba09b21b266e5..bdaa51f974c52 100644 --- a/cpp-ch/local-engine/tests/gtest_parquet_columnindex.cpp +++ b/cpp-ch/local-engine/tests/gtest_parquet_columnindex.cpp @@ -15,12 +15,14 @@ * limitations under the License. */ +#include + + #include "config.h" #if USE_PARQUET #include #include #include -#include #include #include #include @@ -59,6 +61,9 @@ class PrimitiveNodeBuilder parquet::Repetition::type repetition_ = parquet::Repetition::UNDEFINED; parquet::ConvertedType::type converted_type_ = parquet::ConvertedType::NONE; parquet::Type::type physical_type_ = parquet::Type::UNDEFINED; + int length_ = -1; + int precision_ = -1; + int scale_ = -1; public: PrimitiveNodeBuilder & as(parquet::ConvertedType::type converted_type) @@ -67,13 +72,25 @@ class PrimitiveNodeBuilder return *this; } + PrimitiveNodeBuilder & with_length(int length) + { + length_ = length; + return *this; + } + PrimitiveNodeBuilder & asDecimal(int precision, int scale) + { + converted_type_ = parquet::ConvertedType::DECIMAL; + precision_ = precision; + scale_ = scale; + return *this; + } parquet::schema::NodePtr named(const std::string & name) const { assert(!name.empty()); if (physical_type_ == parquet::Type::UNDEFINED) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unsupported physical type"); return parquet::schema::PrimitiveNode::Make( - name, repetition_, physical_type_, converted_type_, /*length=*/-1, /*precision=*/-1, /*scale=*/-1, /*field_id*/ -1); + name, repetition_, physical_type_, converted_type_, length_, precision_, scale_, /*field_id*/ -1); } parquet::ColumnDescriptor descriptor(const std::string & name) const { return {named(name), /*max_definition_level=*/1, 0}; } static PrimitiveNodeBuilder optional(parquet::Type::type physical_type) @@ -483,13 +500,22 @@ using ParquetValue = std::variant< parquet::DoubleType::c_type, parquet::ByteArrayType::c_type>; -ParquetValue to(const DB::Field & value, const parquet::ColumnDescriptor & desc) +template +void doComapre( + const parquet::ColumnDescriptor & descriptor, const DB::Field & value, const std::function & compare) +{ + local_engine::ToParquet to_parquet; + compare({to_parquet.as(value, descriptor)}); +} + +void with_actual(const DB::Field & value, const parquet::ColumnDescriptor & desc, const std::function & compare) { using namespace local_engine; switch (desc.physical_type()) { case parquet::Type::BOOLEAN: - break; + doComapre(desc, value, compare); + return; case parquet::Type::INT32: { switch (desc.converted_type()) { @@ -500,7 +526,8 @@ ParquetValue to(const DB::Field & value, const parquet::ColumnDescriptor & desc) case parquet::ConvertedType::INT_16: case parquet::ConvertedType::INT_32: case parquet::ConvertedType::NONE: - return {parquetCast(value)}; + 
doComapre(desc, value, compare); + return; default: break; } @@ -512,35 +539,83 @@ ParquetValue to(const DB::Field & value, const parquet::ColumnDescriptor & desc) case parquet::ConvertedType::INT_64: case parquet::ConvertedType::UINT_64: case parquet::ConvertedType::NONE: - return {parquetCast(value)}; + doComapre(desc, value, compare); + return; default: break; } break; case parquet::Type::INT96: + // doComapre(desc, value, compare); break; case parquet::Type::FLOAT: - return {value.get()}; + doComapre(desc, value, compare); + return; case parquet::Type::DOUBLE: - return {value.get()}; - break; + doComapre(desc, value, compare); + return; case parquet::Type::BYTE_ARRAY: switch (desc.converted_type()) { case parquet::ConvertedType::UTF8: - return parquetCast(value); + doComapre(desc, value, compare); + return; default: break; } break; case parquet::Type::FIXED_LEN_BYTE_ARRAY: + // doComapre(desc, value, compare); break; case parquet::Type::UNDEFINED: break; } - abort(); + ASSERT_TRUE(false) << "Unsupported physical type: [" << TypeToString(desc.physical_type()) << "] with logical type: [" + << desc.logical_type()->ToString() << "] with converted type: [" << ConvertedTypeToString(desc.converted_type()) + << "]"; } +// for gtest +namespace parquet +{ +void PrintTo(const ByteArray & val, std::ostream * os) +{ + *os << '[' << std::hex; + + for (size_t i = 0; i < val.len; ++i) + { + *os << std::setw(2) << std::setfill('0') << static_cast(val.ptr[i]); + if (i != val.len - 1) + *os << ", "; + } + *os << ']'; +} +} +TEST(ColumnIndex, DecimalField) +{ + // we can't define `operator==` for parquet::FLBAType + Field value = DecimalField(Int128(300000000), 4); + local_engine::ToParquet to_parquet; + const parquet::ColumnDescriptor desc + = PNB::optional(parquet::Type::FIXED_LEN_BYTE_ARRAY).asDecimal(38, 4).with_length(13).descriptor("column1"); + uint8_t expected_a[13]{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x11, 0xE1, 0xA3, 0x0}; + const parquet::ByteArray expected{13, expected_a}; + const parquet::ByteArray actual{13, to_parquet.as(value, desc).ptr}; + ASSERT_EQ(actual, expected); + + + /// Exception test, only in release node +#ifdef NDEBUG + Field unsupport = DecimalField(Int256(300000000), 4); + EXPECT_THROW(to_parquet.as(unsupport, desc), DB::Exception); + + const parquet::ColumnDescriptor error + = PNB::optional(parquet::Type::FIXED_LEN_BYTE_ARRAY).asDecimal(38, 4).with_length(18).descriptor("column1"); + EXPECT_THROW(to_parquet.as(value, error), DB::Exception); +#endif +} + + TEST(ColumnIndex, Field) { std::string s_tmp = "hello world"; @@ -551,7 +626,6 @@ TEST(ColumnIndex, Field) parquet::ColumnDescriptor, //desc ParquetValue //expected value >; - using PNB = test_utils::PrimitiveNodeBuilder; const std::vector datas{ {"int32_UINT_8", static_cast(1), @@ -579,8 +653,7 @@ TEST(ColumnIndex, Field) const auto & value = std::get<1>(data); const auto & desc = std::get<2>(data); const auto & expected = std::get<3>(data); - const auto actual = to(value, desc); - ASSERT_EQ(actual, expected) << name; + with_actual(value, desc, [&](const ParquetValue & actual) { ASSERT_EQ(actual, expected) << name; }); }); const std::vector> primitive_fields{ @@ -612,7 +685,7 @@ struct ReadStatesParam ReadStatesParam() = default; ReadStatesParam(local_engine::RowRanges ranges, std::shared_ptr states) - : row_ranges(std::move(ranges)), read_states(std::move(states)){}; + : row_ranges(std::move(ranges)), read_states(std::move(states)) {}; local_engine::RowRanges row_ranges; std::shared_ptr read_states; diff --git 
a/cpp-ch/local-engine/tests/gtest_parquet_read.cpp b/cpp-ch/local-engine/tests/gtest_parquet_read.cpp index 94f28763e679c..9623ffa98d281 100644 --- a/cpp-ch/local-engine/tests/gtest_parquet_read.cpp +++ b/cpp-ch/local-engine/tests/gtest_parquet_read.cpp @@ -15,6 +15,9 @@ * limitations under the License. */ +#include + + #include "config.h" #if USE_PARQUET @@ -139,6 +142,32 @@ TEST(ParquetRead, ReadSchema) readSchema("alltypes/alltypes_null.parquet"); } +TEST(ParquetRead, VerifyPageindexReaderSupport) +{ + EXPECT_FALSE(local_engine::ParquetFormatFile::pageindex_reader_support( + toBlockRowType(local_engine::test::readParquetSchema(local_engine::test::data_file("alltypes/alltypes_notnull.parquet"))))); + EXPECT_FALSE(local_engine::ParquetFormatFile::pageindex_reader_support( + toBlockRowType(local_engine::test::readParquetSchema(local_engine::test::data_file("alltypes/alltypes_null.parquet"))))); + + + EXPECT_FALSE(local_engine::ParquetFormatFile::pageindex_reader_support( + toBlockRowType(local_engine::test::readParquetSchema(local_engine::test::data_file("array.parquet"))))); + EXPECT_TRUE(local_engine::ParquetFormatFile::pageindex_reader_support( + toBlockRowType(local_engine::test::readParquetSchema(local_engine::test::data_file("date.parquet"))))); + EXPECT_TRUE(local_engine::ParquetFormatFile::pageindex_reader_support( + toBlockRowType(local_engine::test::readParquetSchema(local_engine::test::data_file("datetime64.parquet"))))); + EXPECT_TRUE(local_engine::ParquetFormatFile::pageindex_reader_support( + toBlockRowType(local_engine::test::readParquetSchema(local_engine::test::data_file("decimal.parquet"))))); + EXPECT_TRUE(local_engine::ParquetFormatFile::pageindex_reader_support( + toBlockRowType(local_engine::test::readParquetSchema(local_engine::test::data_file("iris.parquet"))))); + EXPECT_FALSE(local_engine::ParquetFormatFile::pageindex_reader_support( + toBlockRowType(local_engine::test::readParquetSchema(local_engine::test::data_file("map.parquet"))))); + EXPECT_TRUE(local_engine::ParquetFormatFile::pageindex_reader_support( + toBlockRowType(local_engine::test::readParquetSchema(local_engine::test::data_file("sample.parquet"))))); + EXPECT_FALSE(local_engine::ParquetFormatFile::pageindex_reader_support( + toBlockRowType(local_engine::test::readParquetSchema(local_engine::test::data_file("struct.parquet"))))); +} + TEST(ParquetRead, ReadDataNotNull) { const std::map fields{ diff --git a/cpp-ch/local-engine/tests/gtest_parser.cpp b/cpp-ch/local-engine/tests/gtest_parser.cpp new file mode 100644 index 0000000000000..24c796358f450 --- /dev/null +++ b/cpp-ch/local-engine/tests/gtest_parser.cpp @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include + + +using namespace local_engine; +using namespace DB; + +// Plan for https://github.com/ClickHouse/ClickHouse/pull/65234 +INCBIN(resource_embedded_pr_65234_json, SOURCE_DIR "/utils/extern-local-engine/tests/json/clickhouse_pr_65234.json"); + +TEST(SerializedPlanParser, PR65234) +{ + const std::string split + = R"({"items":[{"uriFile":"file:///home/chang/SourceCode/rebase_gluten/backends-clickhouse/target/scala-2.12/test-classes/tests-working-home/tpch-data/supplier/part-00000-16caa751-9774-470c-bd37-5c84c53373c8-c000.snappy.parquet","length":"84633","parquet":{},"schema":{},"metadataColumns":[{}]}]})"; + SerializedPlanParser parser(SerializedPlanParser::global_context); + parser.addSplitInfo(test::pb_util::JsonStringToBinary(split)); + auto query_plan + = parser.parseJson({reinterpret_cast(gresource_embedded_pr_65234_jsonData), gresource_embedded_pr_65234_jsonSize}); +} + +#include +#include +#include +#include +#include + +Chunk testChunk() +{ + auto nameCol = STRING()->createColumn(); + nameCol->insert("one"); + nameCol->insert("two"); + nameCol->insert("three"); + + auto valueCol = UINT()->createColumn(); + valueCol->insert(1); + valueCol->insert(2); + valueCol->insert(3); + MutableColumns x; + x.push_back(std::move(nameCol)); + x.push_back(std::move(valueCol)); + return {std::move(x), 3}; +} + +TEST(LocalExecutor, StorageObjectStorageSink) +{ + /// 0. Create ObjectStorage for HDFS + auto settings = SerializedPlanParser::global_context->getSettingsRef(); + const std::string query + = R"(CREATE TABLE hdfs_engine_xxxx (name String, value UInt32) ENGINE=HDFS('hdfs://localhost:8020/clickhouse/test2', 'Parquet'))"; + DB::ParserCreateQuery parser; + std::string error_message; + const char * pos = query.data(); + auto ast = DB::tryParseQuery( + parser, + pos, + pos + query.size(), + error_message, + /* hilite = */ false, + "QUERY TEST", + /* allow_multi_statements = */ false, + 0, + settings.max_parser_depth, + settings.max_parser_backtracks, + true); + auto & create = ast->as(); + auto arg = create.storage->children[0]; + const auto * func = arg->as(); + EXPECT_TRUE(func && func->name == "HDFS"); + + DB::StorageHDFSConfiguration config; + StorageObjectStorage::Configuration::initialize(config, arg->children[0]->children, SerializedPlanParser::global_context, false); + + const std::shared_ptr object_storage + = std::dynamic_pointer_cast(config.createObjectStorage(SerializedPlanParser::global_context, false)); + EXPECT_TRUE(object_storage != nullptr); + + RelativePathsWithMetadata files_with_metadata; + object_storage->listObjects("/clickhouse", files_with_metadata, 0); + + /// 1. Create ObjectStorageSink + DB::StorageObjectStorageSink sink{ + object_storage, config.clone(), {}, {{STRING(), "name"}, {UINT(), "value"}}, SerializedPlanParser::global_context, ""}; + + /// 2. Create Chunk + /// 3. 
comsume + sink.consume(testChunk()); + sink.onFinish(); +} + +namespace DB +{ +SinkToStoragePtr createFilelinkSink( + const StorageMetadataPtr & metadata_snapshot, + const String & table_name_for_log, + const String & path, + CompressionMethod compression_method, + const std::optional & format_settings, + const String & format_name, + const ContextPtr & context, + int flags); +} + +INCBIN(resource_embedded_readcsv_json, SOURCE_DIR "/utils/extern-local-engine/tests/json/read_student_option_schema.csv.json"); +TEST(LocalExecutor, StorageFileSink) +{ + const std::string split + = R"({"items":[{"uriFile":"file:///home/chang/SourceCode/rebase_gluten/backends-velox/src/test/resources/datasource/csv/student_option_schema.csv","length":"56","text":{"fieldDelimiter":",","maxBlockSize":"8192","header":"1"},"schema":{"names":["id","name","language"],"struct":{"types":[{"string":{"nullability":"NULLABILITY_NULLABLE"}},{"string":{"nullability":"NULLABILITY_NULLABLE"}},{"string":{"nullability":"NULLABILITY_NULLABLE"}}]}},"metadataColumns":[{}]}]})"; + SerializedPlanParser parser(SerializedPlanParser::global_context); + parser.addSplitInfo(test::pb_util::JsonStringToBinary(split)); + auto local_executor = parser.createExecutor( + {reinterpret_cast(gresource_embedded_readcsv_jsonData), gresource_embedded_readcsv_jsonSize}); + + while (local_executor->hasNext()) + { + const Block & x = *local_executor->nextColumnar(); + EXPECT_EQ(4, x.rows()); + } + + StorageInMemoryMetadata metadata; + metadata.setColumns(ColumnsDescription::fromNamesAndTypes({{"name", STRING()}, {"value", UINT()}})); + StorageMetadataPtr metadata_ptr = std::make_shared(metadata); + +/* + auto sink = createFilelinkSink( + metadata_ptr, + "test_table", + "/tmp/test_table.parquet", + CompressionMethod::None, + {}, + "Parquet", + SerializedPlanParser::global_context, + 0); + + sink->consume(testChunk()); + sink->onFinish(); + */ +} \ No newline at end of file diff --git a/cpp-ch/local-engine/tests/json/clickhouse_pr_65234.json b/cpp-ch/local-engine/tests/json/clickhouse_pr_65234.json new file mode 100644 index 0000000000000..1c37b68b7144e --- /dev/null +++ b/cpp-ch/local-engine/tests/json/clickhouse_pr_65234.json @@ -0,0 +1,273 @@ +{ + "extensions": [{ + "extensionFunction": { + "functionAnchor": 1, + "name": "is_not_null:str" + } + }, { + "extensionFunction": { + "functionAnchor": 2, + "name": "equal:str_str" + } + }, { + "extensionFunction": { + "functionAnchor": 3, + "name": "is_not_null:i64" + } + }, { + "extensionFunction": { + "name": "and:bool_bool" + } + }], + "relations": [{ + "root": { + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [2] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["r_regionkey", "r_name"], + "struct": { + "types": [{ + "i64": { + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "string": { + "nullability": "NULLABILITY_NULLABLE" + } + }] + }, + "columnTypes": ["NORMAL_COL", "NORMAL_COL"] + }, + "filter": { + "scalarFunction": { + "outputType": { + "bool": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "scalarFunction": { + "outputType": { + "bool": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "scalarFunction": { + "functionReference": 1, + "outputType": { + "bool": { + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + 
"structField": { + "field": 1 + } + } + } + } + }] + } + } + }, { + "value": { + "scalarFunction": { + "functionReference": 2, + "outputType": { + "bool": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + } + } + } + }, { + "value": { + "literal": { + "string": "EUROPE" + } + } + }] + } + } + }] + } + } + }, { + "value": { + "scalarFunction": { + "functionReference": 3, + "outputType": { + "bool": { + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + } + } + } + } + }] + } + } + }] + } + }, + "advancedExtension": { + "optimization": { + "@type": "type.googleapis.com/google.protobuf.StringValue", + "value": "isMergeTree\u003d0\n" + } + } + } + }, + "condition": { + "scalarFunction": { + "outputType": { + "bool": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "scalarFunction": { + "outputType": { + "bool": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "scalarFunction": { + "functionReference": 1, + "outputType": { + "bool": { + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + } + } + } + }] + } + } + }, { + "value": { + "scalarFunction": { + "functionReference": 2, + "outputType": { + "bool": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + } + } + } + }, { + "value": { + "literal": { + "string": "EUROPE" + } + } + }] + } + } + }] + } + } + }, { + "value": { + "scalarFunction": { + "functionReference": 3, + "outputType": { + "bool": { + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + } + } + } + } + }] + } + } + }] + } + } + } + }, + "expressions": [{ + "selection": { + "directReference": { + "structField": { + } + } + } + }] + } + }, + "names": ["r_regionkey#72"], + "outputSchema": { + "types": [{ + "i64": { + "nullability": "NULLABILITY_NULLABLE" + } + }], + "nullability": "NULLABILITY_REQUIRED" + } + } + }] +} \ No newline at end of file diff --git a/cpp-ch/local-engine/tests/json/gtest_local_engine_config.json b/cpp-ch/local-engine/tests/json/gtest_local_engine_config.json new file mode 100644 index 0000000000000..10f0ea3dfdad9 --- /dev/null +++ b/cpp-ch/local-engine/tests/json/gtest_local_engine_config.json @@ -0,0 +1,269 @@ +{ + "advancedExtensions": { + "enhancement": { + "@type": "type.googleapis.com/substrait.Expression", + "literal": { + "map": { + "keyValues": [ + { + "key": { + "string": "spark.gluten.sql.columnar.backend.ch.runtime_config.logger.level" + }, + "value": { + "string": "test" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.backend.ch.runtime_settings.max_bytes_before_external_sort" + }, + "value": { + "string": "5368709120" + } + }, + { + "key": { + "string": "spark.hadoop.fs.s3a.endpoint" + }, + "value": { + "string": "localhost:9000" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.backend.velox.IOThreads" + }, + "value": { + "string": "0" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.backend.ch.runtime_config.hdfs.input_read_timeout" + }, + "value": { + "string": "180000" + } + }, + { + "key": { + "string": 
"spark.gluten.sql.columnar.backend.ch.runtime_settings.query_plan_enable_optimizations" + }, + "value": { + "string": "false" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.backend.ch.worker.id" + }, + "value": { + "string": "1" + } + }, + { + "key": { + "string": "spark.memory.offHeap.enabled" + }, + "value": { + "string": "true" + } + }, + { + "key": { + "string": "spark.hadoop.fs.s3a.iam.role.session.name" + }, + "value": { + "string": "" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.backend.ch.runtime_config.hdfs.input_connect_timeout" + }, + "value": { + "string": "180000" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.shuffle.codec" + }, + "value": { + "string": "" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.backend.ch.runtime_config.local_engine.settings.log_processors_profiles" + }, + "value": { + "string": "true" + } + }, + { + "key": { + "string": "spark.gluten.memory.offHeap.size.in.bytes" + }, + "value": { + "string": "10737418240" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.shuffle.codecBackend" + }, + "value": { + "string": "" + } + }, + { + "key": { + "string": "spark.sql.orc.compression.codec" + }, + "value": { + "string": "snappy" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.backend.ch.runtime_settings.max_bytes_before_external_group_by" + }, + "value": { + "string": "5368709120" + } + }, + { + "key": { + "string": "spark.hadoop.input.write.timeout" + }, + "value": { + "string": "180000" + } + }, + { + "key": { + "string": "spark.hadoop.fs.s3a.secret.key" + }, + "value": { + "string": "" + } + }, + { + "key": { + "string": "spark.hadoop.fs.s3a.access.key" + }, + "value": { + "string": "" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.backend.ch.runtime_config.hdfs.dfs_client_log_severity" + }, + "value": { + "string": "INFO" + } + }, + { + "key": { + "string": "spark.hadoop.fs.s3a.path.style.access" + }, + "value": { + "string": "true" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.backend.ch.runtime_config.timezone" + }, + "value": { + "string": "Asia/Shanghai" + } + }, + { + "key": { + "string": "spark.hadoop.input.read.timeout" + }, + "value": { + "string": "180000" + } + }, + { + "key": { + "string": "spark.hadoop.fs.s3a.use.instance.credentials" + }, + "value": { + "string": "false" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.backend.ch.runtime_settings.output_format_orc_compression_method" + }, + "value": { + "string": "snappy" + } + }, + { + "key": { + "string": "spark.hadoop.fs.s3a.iam.role" + }, + "value": { + "string": "" + } + }, + { + "key": { + "string": "spark.gluten.memory.task.offHeap.size.in.bytes" + }, + "value": { + "string": "10737418240" + } + }, + { + "key": { + "string": "spark.hadoop.input.connect.timeout" + }, + "value": { + "string": "180000" + } + }, + { + "key": { + "string": "spark.hadoop.dfs.client.log.severity" + }, + "value": { + "string": "INFO" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.backend.velox.SplitPreloadPerDriver" + }, + "value": { + "string": "2" + } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.backend.ch.runtime_config.hdfs.input_write_timeout" + }, + "value": { + "string": "180000" + } + }, + { + "key": { + "string": "spark.hadoop.fs.s3a.connection.ssl.enabled" + }, + "value": { + "string": "false" + } + } + ] + } + } + } + } +} \ No newline at end of file diff --git a/cpp-ch/local-engine/tests/json/read_student_option_schema.csv.json 
b/cpp-ch/local-engine/tests/json/read_student_option_schema.csv.json new file mode 100644 index 0000000000000..f9518d39014a5 --- /dev/null +++ b/cpp-ch/local-engine/tests/json/read_student_option_schema.csv.json @@ -0,0 +1,77 @@ +{ + "relations": [ + { + "root": { + "input": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "id", + "name", + "language" + ], + "struct": { + "types": [ + { + "string": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "string": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "string": { + "nullability": "NULLABILITY_NULLABLE" + } + } + ] + }, + "columnTypes": [ + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL" + ] + }, + "advancedExtension": { + "optimization": { + "@type": "type.googleapis.com/google.protobuf.StringValue", + "value": "isMergeTree=0\n" + } + } + } + }, + "names": [ + "id#20", + "name#21", + "language#22" + ], + "outputSchema": { + "types": [ + { + "string": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "string": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "string": { + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "nullability": "NULLABILITY_REQUIRED" + } + } + } + ] +} \ No newline at end of file diff --git a/cpp/CMake/BuildGTest.cmake b/cpp/CMake/BuildGTest.cmake index fff99455c419d..d85578c0593c1 100644 --- a/cpp/CMake/BuildGTest.cmake +++ b/cpp/CMake/BuildGTest.cmake @@ -6,15 +6,14 @@ set(GLUTEN_GTEST_BUILD_SHA256_CHECKSUM ad7fdba11ea011c1d925b3289cf4af2c66a352e18d4c7264392fead75e919363) set(GLUTEN_GTEST_SOURCE_URL "https://github.com/google/googletest/archive/refs/tags/v${GLUTEN_GTEST_VERSION}.tar.gz" - ) +) resolve_dependency_url(GTEST) message(STATUS "Building gtest from source") FetchContent_Declare( - gtest - URL ${GLUTEN_GTEST_SOURCE_URL} - URL_HASH "${GLUTEN_GTEST_BUILD_SHA256_CHECKSUM}" -) + gtest + URL ${GLUTEN_GTEST_SOURCE_URL} + URL_HASH "${GLUTEN_GTEST_BUILD_SHA256_CHECKSUM}") FetchContent_MakeAvailable(gtest) diff --git a/cpp/CMake/BuildGflags.cmake b/cpp/CMake/BuildGflags.cmake index 4cb2011158356..8e66bd6b9839b 100644 --- a/cpp/CMake/BuildGflags.cmake +++ b/cpp/CMake/BuildGflags.cmake @@ -17,10 +17,10 @@ include_guard(GLOBAL) set(GLUTEN_GFLAGS_BUILD_SHA256_CHECKSUM - 34af2f15cf7367513b352bdcd2493ab14ce43692d2dcd9dfc499492966c64dcf) + 34af2f15cf7367513b352bdcd2493ab14ce43692d2dcd9dfc499492966c64dcf) string(CONCAT GLUTEN_GFLAGS_SOURCE_URL - "https://github.com/gflags/gflags/archive/refs/tags/" - "v${GLUTEN_GFLAGS_VERSION}.tar.gz") + "https://github.com/gflags/gflags/archive/refs/tags/" + "v${GLUTEN_GFLAGS_VERSION}.tar.gz") resolve_dependency_url(GFLAGS) diff --git a/cpp/CMake/BuildGlog.cmake b/cpp/CMake/BuildGlog.cmake index cf405225c3136..3f0f78a165314 100644 --- a/cpp/CMake/BuildGlog.cmake +++ b/cpp/CMake/BuildGlog.cmake @@ -14,10 +14,10 @@ include_guard(GLOBAL) set(GLUTEN_GLOG_BUILD_SHA256_CHECKSUM - 8a83bf982f37bb70825df71a9709fa90ea9f4447fb3c099e1d720a439d88bad6) + 8a83bf982f37bb70825df71a9709fa90ea9f4447fb3c099e1d720a439d88bad6) set(GLUTEN_GLOG_SOURCE_URL - "https://github.com/google/glog/archive/refs/tags/v${GLUTEN_GLOG_VERSION}.tar.gz" - ) + "https://github.com/google/glog/archive/refs/tags/v${GLUTEN_GLOG_VERSION}.tar.gz" +) resolve_dependency_url(GLOG) @@ -27,7 +27,7 @@ FetchContent_Declare( URL ${GLUTEN_GLOG_SOURCE_URL} URL_HASH SHA256=${GLUTEN_GLOG_BUILD_SHA256_CHECKSUM} PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/glog/glog-no-export.patch - && git apply ${CMAKE_CURRENT_LIST_DIR}/glog/glog-config.patch) + && git apply 
${CMAKE_CURRENT_LIST_DIR}/glog/glog-config.patch) set(BUILD_SHARED_LIBS OFF) set(WITH_UNWIND OFF) @@ -48,6 +48,6 @@ endif() # These headers are missing from the include dir but adding the src dir causes # issues with folly so we just copy it to the include dir file(COPY ${glog_SOURCE_DIR}/src/glog/platform.h - DESTINATION ${glog_BINARY_DIR}/glog) + DESTINATION ${glog_BINARY_DIR}/glog) file(COPY ${glog_SOURCE_DIR}/src/glog/log_severity.h - DESTINATION ${glog_BINARY_DIR}/glog) + DESTINATION ${glog_BINARY_DIR}/glog) diff --git a/cpp/CMake/BuildGoogleBenchmark.cmake b/cpp/CMake/BuildGoogleBenchmark.cmake index 8efbb58eab742..a71d73432f62c 100644 --- a/cpp/CMake/BuildGoogleBenchmark.cmake +++ b/cpp/CMake/BuildGoogleBenchmark.cmake @@ -21,8 +21,10 @@ include(FetchContent) set(GLUTEN_GBENCHMARK_BUILD_VERSION "v1.6.0") set(GLUTEN_GBENCHMARK_SOURCE_URL "https://github.com/google/benchmark/archive/refs/tags/${GLUTEN_GBENCHMARK_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/gbenchmark-${GLUTEN_GBENCHMARK_BUILD_VERSION}.tar.gz") -set(GLUTEN_GBENCHMARK_BUILD_SHA256_CHECKSUM "1f71c72ce08d2c1310011ea6436b31e39ccab8c2db94186d26657d41747c85d6") + "https://github.com/ursa-labs/thirdparty/releases/download/latest/gbenchmark-${GLUTEN_GBENCHMARK_BUILD_VERSION}.tar.gz" +) +set(GLUTEN_GBENCHMARK_BUILD_SHA256_CHECKSUM + "1f71c72ce08d2c1310011ea6436b31e39ccab8c2db94186d26657d41747c85d6") resolve_dependency_url(GBENCHMARK) @@ -30,12 +32,11 @@ set(GBENCHMARK_CMAKE_ARGS "-fPIC -w") message(STATUS "Building google benchmark from source") FetchContent_Declare( - gbenchmark - URL ${GLUTEN_GBENCHMARK_SOURCE_URL} - URL_HASH "${GLUTEN_GBENCHMARK_BUILD_SHA256_CHECKSUM}" -) + gbenchmark + URL ${GLUTEN_GBENCHMARK_SOURCE_URL} + URL_HASH "${GLUTEN_GBENCHMARK_BUILD_SHA256_CHECKSUM}") -if (NOT gbenchmark_POPULATED) +if(NOT gbenchmark_POPULATED) # We don't want to build tests. 
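FetchContent above pulls Google Benchmark in for the benchmark_local_engine targets touched earlier in this patch. For reference, a minimal standalone benchmark translation unit against that framework is sketched below; the names are illustrative, and BENCHMARK_MAIN() applies only to a standalone binary (the in-tree benchmarks define their own main() and call ::benchmark::Initialize themselves):

#include <benchmark/benchmark.h>

// Trivial micro-benchmark; DoNotOptimize keeps the compiler from deleting the loop body.
static void BM_Example(benchmark::State & state)
{
    for (auto _ : state)
        benchmark::DoNotOptimize(1 + 1);
}
BENCHMARK(BM_Example)->Unit(benchmark::kMillisecond);

BENCHMARK_MAIN();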
set(BENCHMARK_ENABLE_TESTING OFF diff --git a/cpp/CMake/BuildMemkind.cmake b/cpp/CMake/BuildMemkind.cmake index 039db0cc0d818..2f2248de6fc43 100644 --- a/cpp/CMake/BuildMemkind.cmake +++ b/cpp/CMake/BuildMemkind.cmake @@ -26,48 +26,50 @@ endif() macro(build_hwloc) message(STATUS "Building hwloc from source") set(HWLOC_BUILD_VERSION "2.8.0") - set(HWLOC_BUILD_SHA256_CHECKSUM "311d44e99bbf6d269c2cbc569d073978d88352bc31d51e31457d4df94783172d") + set(HWLOC_BUILD_SHA256_CHECKSUM + "311d44e99bbf6d269c2cbc569d073978d88352bc31d51e31457d4df94783172d") set(HWLOC_SOURCE_URL - "https://github.com/open-mpi/hwloc/archive/refs/tags/hwloc-${HWLOC_BUILD_VERSION}.tar.gz") + "https://github.com/open-mpi/hwloc/archive/refs/tags/hwloc-${HWLOC_BUILD_VERSION}.tar.gz" + ) set(HWLOC_LIB_NAME "hwloc") set(HWLOC_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/hwloc_ep-install") set(HWLOC_SOURCE_DIR "${HWLOC_PREFIX}/src/hwloc_ep") set(HWLOC_INCLUDE_DIR "${HWLOC_SOURCE_DIR}/include") set(HWLOC_LIB_DIR "${HWLOC_SOURCE_DIR}/hwloc/.libs") - set(HWLOC_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${HWLOC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(HWLOC_STATIC_LIB_NAME + "${CMAKE_STATIC_LIBRARY_PREFIX}${HWLOC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) set(HWLOC_STATIC_LIB_TARGETS - "${HWLOC_SOURCE_DIR}/src/.libs/${HWLOC_STATIC_LIB_NAME}") + "${HWLOC_SOURCE_DIR}/src/.libs/${HWLOC_STATIC_LIB_NAME}") set(HWLOC_CONFIGURE_ARGS - "--prefix=${HWLOC_PREFIX}" - "--with-pic" - "--enable-static" - "--disable-shared" - "--enable-plugins") - ExternalProject_Add(hwloc_ep - PREFIX ${HWLOC_PREFIX} - URL ${HWLOC_SOURCE_URL} - URL_HASH "SHA256=${HWLOC_BUILD_SHA256_CHECKSUM}" - SOURCE_DIR ${HWLOC_SOURCE_DIR} - CONFIGURE_COMMAND ./configure ${HWLOC_CONFIGURE_ARGS} - BUILD_COMMAND ${MAKE} - BUILD_BYPRODUCTS ${HWLOC_STATIC_LIB_TARGETS} - BUILD_IN_SOURCE 1) + "--prefix=${HWLOC_PREFIX}" "--with-pic" "--enable-static" + "--disable-shared" "--enable-plugins") + ExternalProject_Add( + hwloc_ep + PREFIX ${HWLOC_PREFIX} + URL ${HWLOC_SOURCE_URL} + URL_HASH "SHA256=${HWLOC_BUILD_SHA256_CHECKSUM}" + SOURCE_DIR ${HWLOC_SOURCE_DIR} + CONFIGURE_COMMAND ./configure ${HWLOC_CONFIGURE_ARGS} + BUILD_COMMAND ${MAKE} + BUILD_BYPRODUCTS ${HWLOC_STATIC_LIB_TARGETS} + BUILD_IN_SOURCE 1) - ExternalProject_Add_Step(hwloc_ep pre-configure - COMMAND ./autogen.sh - DEPENDEES download - DEPENDERS configure - WORKING_DIRECTORY ${HWLOC_SOURCE_DIR}) + ExternalProject_Add_Step( + hwloc_ep pre-configure + COMMAND ./autogen.sh + DEPENDEES download + DEPENDERS configure + WORKING_DIRECTORY ${HWLOC_SOURCE_DIR}) # The include directory must exist before it is referenced by a target. 
file(MAKE_DIRECTORY "${HWLOC_INCLUDE_DIR}") add_library(hwloc::hwloc STATIC IMPORTED) - set_target_properties(hwloc::hwloc - PROPERTIES IMPORTED_LOCATION - "${HWLOC_LIB_DIR}/${HWLOC_STATIC_LIB_NAME}" - INTERFACE_INCLUDE_DIRECTORIES - "${HWLOC_INCLUDE_DIR}") + set_target_properties( + hwloc::hwloc + PROPERTIES IMPORTED_LOCATION "${HWLOC_LIB_DIR}/${HWLOC_STATIC_LIB_NAME}" + INTERFACE_INCLUDE_DIRECTORIES "${HWLOC_INCLUDE_DIR}") add_dependencies(hwloc::hwloc hwloc_ep) endmacro() @@ -75,36 +77,43 @@ endmacro() macro(build_memkind) message(STATUS "Building Memkind from source") set(MEMKIND_BUILD_VERSION "v1.14.0") - set(MEMKIND_BUILD_SHA256_CHECKSUM "ab366b20b5a87ea655483631fc762ba6eb59eb6c3a08652e643f1ee3f06a6a12") + set(MEMKIND_BUILD_SHA256_CHECKSUM + "ab366b20b5a87ea655483631fc762ba6eb59eb6c3a08652e643f1ee3f06a6a12") set(MEMKIND_SOURCE_URL - "https://github.com/memkind/memkind/archive/refs/tags/${MEMKIND_BUILD_VERSION}.tar.gz") + "https://github.com/memkind/memkind/archive/refs/tags/${MEMKIND_BUILD_VERSION}.tar.gz" + ) set(MEMKIND_LIB_NAME "memkind") set(MEMKIND_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/memkind_ep-install") set(MEMKIND_SOURCE_DIR "${MEMKIND_PREFIX}/src/memkind_ep") set(MEMKIND_INCLUDE_DIR "${MEMKIND_SOURCE_DIR}/include") set(MEMKIND_LIB_DIR "${MEMKIND_SOURCE_DIR}/.libs") - set(MEMKIND_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${MEMKIND_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(MEMKIND_STATIC_LIB_NAME + "${CMAKE_STATIC_LIBRARY_PREFIX}${MEMKIND_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) set(MEMKIND_STATIC_LIB_TARGETS - "${MEMKIND_SOURCE_DIR}/src/.libs/${MEMKIND_STATIC_LIB_NAME}") - set(MEMKIND_CONFIGURE_ARGS - "--prefix=${MEMKIND_PREFIX}" - "--with-pic" - "--enable-static") - ExternalProject_Add(memkind_ep - PREFIX ${MEMKIND_PREFIX} - URL ${MEMKIND_SOURCE_URL} - URL_HASH "SHA256=${MEMKIND_BUILD_SHA256_CHECKSUM}" - SOURCE_DIR ${MEMKIND_SOURCE_DIR} - CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LDFLAGS=-L${HWLOC_LIB_DIR} env CFLAGS=-I${HWLOC_INCLUDE_DIR} env CXXFLAGS=-I${HWLOC_INCLUDE_DIR} ./configure ${MEMKIND_CONFIGURE_ARGS} - BUILD_COMMAND ${MAKE} - BUILD_BYPRODUCTS ${MEMKIND_STATIC_LIB_TARGETS} - BUILD_IN_SOURCE 1) + "${MEMKIND_SOURCE_DIR}/src/.libs/${MEMKIND_STATIC_LIB_NAME}") + set(MEMKIND_CONFIGURE_ARGS "--prefix=${MEMKIND_PREFIX}" "--with-pic" + "--enable-static") + ExternalProject_Add( + memkind_ep + PREFIX ${MEMKIND_PREFIX} + URL ${MEMKIND_SOURCE_URL} + URL_HASH "SHA256=${MEMKIND_BUILD_SHA256_CHECKSUM}" + SOURCE_DIR ${MEMKIND_SOURCE_DIR} + CONFIGURE_COMMAND + ${CMAKE_COMMAND} -E env LDFLAGS=-L${HWLOC_LIB_DIR} env + CFLAGS=-I${HWLOC_INCLUDE_DIR} env CXXFLAGS=-I${HWLOC_INCLUDE_DIR} + ./configure ${MEMKIND_CONFIGURE_ARGS} + BUILD_COMMAND ${MAKE} + BUILD_BYPRODUCTS ${MEMKIND_STATIC_LIB_TARGETS} + BUILD_IN_SOURCE 1) - ExternalProject_Add_Step(memkind_ep pre-configure - COMMAND ./autogen.sh - DEPENDEES download - DEPENDERS configure - WORKING_DIRECTORY ${MEMKIND_SOURCE_DIR}) + ExternalProject_Add_Step( + memkind_ep pre-configure + COMMAND ./autogen.sh + DEPENDEES download + DEPENDERS configure + WORKING_DIRECTORY ${MEMKIND_SOURCE_DIR}) add_dependencies(memkind_ep hwloc::hwloc) @@ -112,12 +121,12 @@ macro(build_memkind) file(MAKE_DIRECTORY "${MEMKIND_INCLUDE_DIR}") add_library(memkind::memkind STATIC IMPORTED) - set_target_properties(memkind::memkind - PROPERTIES IMPORTED_LOCATION - "${MEMKIND_LIB_DIR}/${MEMKIND_STATIC_LIB_NAME}" - INTERFACE_INCLUDE_DIRECTORIES - "${MEMKIND_INCLUDE_DIR}") - target_link_libraries(memkind::memkind INTERFACE hwloc::hwloc dl numa pthread 
daxctl) + set_target_properties( + memkind::memkind + PROPERTIES IMPORTED_LOCATION "${MEMKIND_LIB_DIR}/${MEMKIND_STATIC_LIB_NAME}" + INTERFACE_INCLUDE_DIRECTORIES "${MEMKIND_INCLUDE_DIR}") + target_link_libraries(memkind::memkind INTERFACE hwloc::hwloc dl numa pthread + daxctl) add_dependencies(memkind::memkind memkind_ep) endmacro() diff --git a/cpp/CMake/BuildQATZstd.cmake b/cpp/CMake/BuildQATZstd.cmake index f6c152db14603..f79e9ea58fcc3 100644 --- a/cpp/CMake/BuildQATZstd.cmake +++ b/cpp/CMake/BuildQATZstd.cmake @@ -28,61 +28,72 @@ macro(build_qatzstd) include(FindZstd) message(STATUS "Building QAT-ZSTD from source") - set(QATZSTD_SOURCE_URL - "https://github.com/marin-ma/QAT-ZSTD-Plugin.git") + set(QATZSTD_SOURCE_URL "https://github.com/marin-ma/QAT-ZSTD-Plugin.git") set(QATZSTD_SOURCE_BRANCH "fix-duplicate-symbol") set(QATZSTD_LIB_NAME "qatseqprod") - set(QATZSTD_PREFIX - "${CMAKE_CURRENT_BINARY_DIR}/qatzstd_ep-install") + set(QATZSTD_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/qatzstd_ep-install") set(QATZSTD_SOURCE_DIR "${QATZSTD_PREFIX}/src/qatzstd_ep") set(QATZSTD_INCLUDE_DIR "${QATZSTD_SOURCE_DIR}/src") - set(QATZSTD_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${QATZSTD_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(QATZSTD_STATIC_LIB_TARGETS "${QATZSTD_SOURCE_DIR}/src/${QATZSTD_STATIC_LIB_NAME}") + set(QATZSTD_STATIC_LIB_NAME + "${CMAKE_STATIC_LIBRARY_PREFIX}${QATZSTD_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(QATZSTD_STATIC_LIB_TARGETS + "${QATZSTD_SOURCE_DIR}/src/${QATZSTD_STATIC_LIB_NAME}") set(QATZSTD_MAKE_ARGS "ENABLE_USDM_DRV=1" "ZSTDLIB=${ZSTD_INCLUDE_DIR}") - ExternalProject_Add(qatzstd_ep - PREFIX ${QATZSTD_PREFIX} - GIT_REPOSITORY ${QATZSTD_SOURCE_URL} - GIT_TAG ${QATZSTD_SOURCE_BRANCH} - SOURCE_DIR ${QATZSTD_SOURCE_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND ${MAKE} ${QATZSTD_MAKE_ARGS} - INSTALL_COMMAND "" - BUILD_BYPRODUCTS ${QATZSTD_STATIC_LIB_TARGETS} - BUILD_IN_SOURCE 1) + ExternalProject_Add( + qatzstd_ep + PREFIX ${QATZSTD_PREFIX} + GIT_REPOSITORY ${QATZSTD_SOURCE_URL} + GIT_TAG ${QATZSTD_SOURCE_BRANCH} + SOURCE_DIR ${QATZSTD_SOURCE_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND ${MAKE} ${QATZSTD_MAKE_ARGS} + INSTALL_COMMAND "" + BUILD_BYPRODUCTS ${QATZSTD_STATIC_LIB_TARGETS} + BUILD_IN_SOURCE 1) add_library(qatzstd::qatzstd STATIC IMPORTED) # The include directory must exist before it is referenced by a target. 
file(MAKE_DIRECTORY "${QATZSTD_INCLUDE_DIR}") - set(QATZSTD_INCLUDE_DIRS - "${QATZSTD_INCLUDE_DIR}" - "${ZSTD_INCLUDE_DIR}") + set(QATZSTD_INCLUDE_DIRS "${QATZSTD_INCLUDE_DIR}" "${ZSTD_INCLUDE_DIR}") set(QATZSTD_LINK_LIBRARIES - "${ZSTD_LIBRARY}" - "${QAT_LIBRARY}" - "${USDM_DRV_LIBRARY}" - "${ADF_LIBRARY}" + "${ZSTD_LIBRARY}" "${QAT_LIBRARY}" "${USDM_DRV_LIBRARY}" "${ADF_LIBRARY}" "${OSAL_LIBRARY}") - set_target_properties(qatzstd::qatzstd - PROPERTIES IMPORTED_LOCATION - "${QATZSTD_STATIC_LIB_TARGETS}" - INTERFACE_INCLUDE_DIRECTORIES - "${QATZSTD_INCLUDE_DIRS}" - INTERFACE_LINK_LIBRARIES - "${QATZSTD_LINK_LIBRARIES}") + set_target_properties( + qatzstd::qatzstd + PROPERTIES IMPORTED_LOCATION "${QATZSTD_STATIC_LIB_TARGETS}" + INTERFACE_INCLUDE_DIRECTORIES "${QATZSTD_INCLUDE_DIRS}" + INTERFACE_LINK_LIBRARIES "${QATZSTD_LINK_LIBRARIES}") add_dependencies(qatzstd::qatzstd qatzstd_ep) endmacro() -find_library(QAT_LIBRARY REQUIRED NAMES qat PATHS "$ENV{ICP_ROOT}/build" NO_DEFAULT_PATH) -find_library(USDM_DRV_LIBRARY REQUIRED NAMES usdm_drv PATHS "$ENV{ICP_ROOT}/build" NO_DEFAULT_PATH) -find_library(ADF_LIBRARY REQUIRED NAMES adf PATHS "$ENV{ICP_ROOT}/build" NO_DEFAULT_PATH) -find_library(OSAL_LIBRARY REQUIRED NAMES osal PATHS "$ENV{ICP_ROOT}/build" NO_DEFAULT_PATH) +find_library( + QAT_LIBRARY REQUIRED + NAMES qat + PATHS "$ENV{ICP_ROOT}/build" + NO_DEFAULT_PATH) +find_library( + USDM_DRV_LIBRARY REQUIRED + NAMES usdm_drv + PATHS "$ENV{ICP_ROOT}/build" + NO_DEFAULT_PATH) +find_library( + ADF_LIBRARY REQUIRED + NAMES adf + PATHS "$ENV{ICP_ROOT}/build" + NO_DEFAULT_PATH) +find_library( + OSAL_LIBRARY REQUIRED + NAMES osal + PATHS "$ENV{ICP_ROOT}/build" + NO_DEFAULT_PATH) message(STATUS "Found qat: ${QAT_LIBRARY}") message(STATUS "Found usdm_drv: ${USDM_DRV_LIBRARY}") @@ -90,4 +101,3 @@ message(STATUS "Found adf: ${ADF_LIBRARY}") message(STATUS "Found osal: ${OSAL_LIBRARY}") build_qatzstd() - diff --git a/cpp/CMake/BuildQATzip.cmake b/cpp/CMake/BuildQATzip.cmake index 376f1645509af..fd75757d72865 100644 --- a/cpp/CMake/BuildQATzip.cmake +++ b/cpp/CMake/BuildQATzip.cmake @@ -26,38 +26,42 @@ endif() macro(build_qatzip) message(STATUS "Building QATzip from source") set(QATZIP_BUILD_VERSION "v1.1.1") - set(QATZIP_BUILD_SHA256_CHECKSUM "679f5522deb35e7ffa36f227ae49d07ef2d69a83e56bfda849303829b274e79b") + set(QATZIP_BUILD_SHA256_CHECKSUM + "679f5522deb35e7ffa36f227ae49d07ef2d69a83e56bfda849303829b274e79b") set(QATZIP_SOURCE_URL - "https://github.com/intel/QATzip/archive/refs/tags/${QATZIP_BUILD_VERSION}.tar.gz") + "https://github.com/intel/QATzip/archive/refs/tags/${QATZIP_BUILD_VERSION}.tar.gz" + ) set(QATZIP_LIB_NAME "qatzip") - set(QATZIP_PREFIX - "${CMAKE_CURRENT_BINARY_DIR}/qatzip_ep-install") + set(QATZIP_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/qatzip_ep-install") set(QATZIP_SOURCE_DIR "${QATZIP_PREFIX}/src/qatzip_ep") set(QATZIP_INCLUDE_DIR "${QATZIP_SOURCE_DIR}/include") - set(QATZIP_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${QATZIP_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(QATZIP_STATIC_LIB_NAME + "${CMAKE_STATIC_LIBRARY_PREFIX}${QATZIP_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) set(QATZIP_STATIC_LIB_TARGETS "${QATZIP_SOURCE_DIR}/src/.libs/${QATZIP_STATIC_LIB_NAME}") - set(QATZIP_CONFIGURE_ARGS - "--prefix=${QATZIP_PREFIX}" - "--with-pic" - "--with-ICP_ROOT=$ENV{ICP_ROOT}") + set(QATZIP_CONFIGURE_ARGS "--prefix=${QATZIP_PREFIX}" "--with-pic" + "--with-ICP_ROOT=$ENV{ICP_ROOT}") - ExternalProject_Add(qatzip_ep - PREFIX ${QATZIP_PREFIX} - URL ${QATZIP_SOURCE_URL} - URL_HASH 
"SHA256=${QATZIP_BUILD_SHA256_CHECKSUM}" - SOURCE_DIR ${QATZIP_SOURCE_DIR} - CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env QZ_ROOT=${QATZIP_SOURCE_DIR} ./configure ${QATZIP_CONFIGURE_ARGS} - BUILD_COMMAND ${MAKE} all - BUILD_BYPRODUCTS ${QATZIP_STATIC_LIB_TARGETS} - BUILD_IN_SOURCE 1) + ExternalProject_Add( + qatzip_ep + PREFIX ${QATZIP_PREFIX} + URL ${QATZIP_SOURCE_URL} + URL_HASH "SHA256=${QATZIP_BUILD_SHA256_CHECKSUM}" + SOURCE_DIR ${QATZIP_SOURCE_DIR} + CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env QZ_ROOT=${QATZIP_SOURCE_DIR} + ./configure ${QATZIP_CONFIGURE_ARGS} + BUILD_COMMAND ${MAKE} all + BUILD_BYPRODUCTS ${QATZIP_STATIC_LIB_TARGETS} + BUILD_IN_SOURCE 1) - ExternalProject_Add_Step(qatzip_ep pre-configure - COMMAND ./autogen.sh - DEPENDEES download - DEPENDERS configure - WORKING_DIRECTORY ${QATZIP_SOURCE_DIR}) + ExternalProject_Add_Step( + qatzip_ep pre-configure + COMMAND ./autogen.sh + DEPENDEES download + DEPENDERS configure + WORKING_DIRECTORY ${QATZIP_SOURCE_DIR}) # The include directory must exist before it is referenced by a target. file(MAKE_DIRECTORY "${QATZIP_INCLUDE_DIR}") @@ -72,14 +76,16 @@ macro(build_qatzip) "${OSAL_LIBRARY}" Threads::Threads) + # Fix libudev.so not get linked. + set(QATZIP_LINK_OPTIONS "-Wl,--no-as-needed") + add_library(qatzip::qatzip STATIC IMPORTED) - set_target_properties(qatzip::qatzip - PROPERTIES IMPORTED_LOCATION - "${QATZIP_STATIC_LIB_TARGETS}" - INTERFACE_INCLUDE_DIRECTORIES - "${QATZIP_INCLUDE_DIR}" - INTERFACE_LINK_LIBRARIES - "${QATZIP_LINK_LIBRARIES}") + set_target_properties( + qatzip::qatzip + PROPERTIES IMPORTED_LOCATION "${QATZIP_STATIC_LIB_TARGETS}" + INTERFACE_INCLUDE_DIRECTORIES "${QATZIP_INCLUDE_DIR}" + INTERFACE_LINK_LIBRARIES "${QATZIP_LINK_LIBRARIES}" + INTERFACE_LINK_OPTIONS "${QATZIP_LINK_OPTIONS}") add_dependencies(qatzip::qatzip qatzip_ep) endmacro() @@ -90,10 +96,26 @@ find_package(Threads REQUIRED) find_library(ZLIB_LIBRARY REQUIRED NAMES z) find_library(LZ4_LIBRARY REQUIRED NAMES lz4) find_library(UDEV_LIBRARY REQUIRED NAMES udev) -find_library(USDM_DRV_LIBRARY REQUIRED NAMES usdm_drv PATHS "$ENV{ICP_ROOT}/build" NO_DEFAULT_PATH) -find_library(QAT_LIBRARY REQUIRED NAMES qat PATHS "$ENV{ICP_ROOT}/build" NO_DEFAULT_PATH) -find_library(OSAL_LIBRARY REQUIRED NAMES osal PATHS "$ENV{ICP_ROOT}/build" NO_DEFAULT_PATH) -find_library(ADF_LIBRARY REQUIRED NAMES adf PATHS "$ENV{ICP_ROOT}/build" NO_DEFAULT_PATH) +find_library( + USDM_DRV_LIBRARY REQUIRED + NAMES usdm_drv + PATHS "$ENV{ICP_ROOT}/build" + NO_DEFAULT_PATH) +find_library( + QAT_LIBRARY REQUIRED + NAMES qat + PATHS "$ENV{ICP_ROOT}/build" + NO_DEFAULT_PATH) +find_library( + OSAL_LIBRARY REQUIRED + NAMES osal + PATHS "$ENV{ICP_ROOT}/build" + NO_DEFAULT_PATH) +find_library( + ADF_LIBRARY REQUIRED + NAMES adf + PATHS "$ENV{ICP_ROOT}/build" + NO_DEFAULT_PATH) message(STATUS "Found zlib: ${ZLIB_LIBRARY}") message(STATUS "Found lz4: ${LZ4_LIBRARY}") @@ -102,4 +124,3 @@ message(STATUS "Found usdm_drv: ${USDM_DRV_LIBRARY}") message(STATUS "Found qat: ${QAT_LIBRARY}") build_qatzip() - diff --git a/cpp/CMake/BuildQpl.cmake b/cpp/CMake/BuildQpl.cmake index dbfd16645342d..7715bb8e767fc 100644 --- a/cpp/CMake/BuildQpl.cmake +++ b/cpp/CMake/BuildQpl.cmake @@ -18,46 +18,43 @@ include(ExternalProject) macro(build_qpl) - message(STATUS "Building QPL from source") - set(QPL_BUILD_VERSION "v1.1.0") - set(QPL_BUILD_SHA256_CHECKSUM "00306000035621dfbc21007481395c46ba9723fc8add8ca5142847b94dc564c5") - set(QPL_SOURCE_URL - "https://github.com/intel/qpl/archive/refs/tags/v1.1.0.tar.gz") - 
set(QPL_LIB_NAME "qpl") - - set(QPL_PREFIX - "${CMAKE_CURRENT_BINARY_DIR}/qpl_ep-install") - set(QPL_SOURCE_DIR "${QPL_PREFIX}/src/qpl_ep") - set(QPL_INCLUDE_DIR "${QPL_PREFIX}/include") - set(QPL_LIB_DIR "${QPL_PREFIX}/lib") - set(QPL_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${QPL_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}${QPL_STATIC_LIB_MAJOR_VERSION}") - set(QPL_STATIC_LIB_TARGETS - "${QPL_LIB_DIR}/${QPL_STATIC_LIB_NAME}" - ) - ExternalProject_Add(qpl_ep - PREFIX ${QPL_PREFIX} - URL ${QPL_SOURCE_URL} - URL_HASH "SHA256=${QPL_BUILD_SHA256_CHECKSUM}" - SOURCE_DIR ${QPL_SOURCE_DIR} - CMAKE_ARGS - -DCMAKE_INSTALL_PREFIX=${QPL_PREFIX} - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DQPL_BUILD_TESTS=OFF - -DLOG_HW_INIT=ON - BUILD_BYPRODUCTS ${QPL_STATIC_LIB_TARGETS}) - - # The include directory must exist before it is referenced by a target. - file(MAKE_DIRECTORY "${QPL_INCLUDE_DIR}") - - add_library(qpl::qpl STATIC IMPORTED) - set_target_properties(qpl::qpl - PROPERTIES IMPORTED_LOCATION - "${QPL_LIB_DIR}/${QPL_STATIC_LIB_NAME}" - INTERFACE_INCLUDE_DIRECTORIES - "${QPL_INCLUDE_DIR}") - - add_dependencies(qpl::qpl qpl_ep) + message(STATUS "Building QPL from source") + set(QPL_BUILD_VERSION "v1.1.0") + set(QPL_BUILD_SHA256_CHECKSUM + "00306000035621dfbc21007481395c46ba9723fc8add8ca5142847b94dc564c5") + set(QPL_SOURCE_URL + "https://github.com/intel/qpl/archive/refs/tags/v1.1.0.tar.gz") + set(QPL_LIB_NAME "qpl") + + set(QPL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/qpl_ep-install") + set(QPL_SOURCE_DIR "${QPL_PREFIX}/src/qpl_ep") + set(QPL_INCLUDE_DIR "${QPL_PREFIX}/include") + set(QPL_LIB_DIR "${QPL_PREFIX}/lib") + set(QPL_STATIC_LIB_NAME + "${CMAKE_STATIC_LIBRARY_PREFIX}${QPL_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}${QPL_STATIC_LIB_MAJOR_VERSION}" + ) + set(QPL_STATIC_LIB_TARGETS "${QPL_LIB_DIR}/${QPL_STATIC_LIB_NAME}") + ExternalProject_Add( + qpl_ep + PREFIX ${QPL_PREFIX} + URL ${QPL_SOURCE_URL} + URL_HASH "SHA256=${QPL_BUILD_SHA256_CHECKSUM}" + SOURCE_DIR ${QPL_SOURCE_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${QPL_PREFIX} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DQPL_BUILD_TESTS=OFF + -DLOG_HW_INIT=ON + BUILD_BYPRODUCTS ${QPL_STATIC_LIB_TARGETS}) + + # The include directory must exist before it is referenced by a target. + file(MAKE_DIRECTORY "${QPL_INCLUDE_DIR}") + + add_library(qpl::qpl STATIC IMPORTED) + set_target_properties( + qpl::qpl + PROPERTIES IMPORTED_LOCATION "${QPL_LIB_DIR}/${QPL_STATIC_LIB_NAME}" + INTERFACE_INCLUDE_DIRECTORIES "${QPL_INCLUDE_DIR}") + + add_dependencies(qpl::qpl qpl_ep) endmacro() build_qpl() - diff --git a/cpp/CMake/Buildjemalloc_pic.cmake b/cpp/CMake/Buildjemalloc_pic.cmake new file mode 100644 index 0000000000000..7c2316ea95406 --- /dev/null +++ b/cpp/CMake/Buildjemalloc_pic.cmake @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# Building Jemalloc +macro(build_jemalloc) + message(STATUS "Building Jemalloc from Source") + + if(DEFINED ENV{GLUTEN_JEMALLOC_URL}) + set(JEMALLOC_SOURCE_URL "$ENV{GLUTEN_JEMALLOC_URL}") + else() + set(JEMALLOC_BUILD_VERSION "5.2.1") + set(JEMALLOC_SOURCE_URL + "https://github.com/jemalloc/jemalloc/releases/download/${JEMALLOC_BUILD_VERSION}/jemalloc-${JEMALLOC_BUILD_VERSION}.tar.bz2" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/jemalloc-${JEMALLOC_BUILD_VERSION}.tar.bz2" + ) + endif() + + set(JEMALLOC_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/jemalloc_ep-install") + set(JEMALLOC_LIB_DIR "${JEMALLOC_PREFIX}/lib") + set(JEMALLOC_INCLUDE_DIR "${JEMALLOC_PREFIX}/include") + set(JEMALLOC_STATIC_LIB + "${JEMALLOC_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}jemalloc_pic${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(JEMALLOC_INCLUDE "${JEMALLOC_PREFIX}/include") + set(JEMALLOC_CONFIGURE_ARGS + "AR=${CMAKE_AR}" + "CC=${CMAKE_C_COMPILER}" + "--prefix=${JEMALLOC_PREFIX}" + "--libdir=${JEMALLOC_LIB_DIR}" + "--with-jemalloc-prefix=je_gluten_" + "--with-private-namespace=je_gluten_private_" + "--without-export" + "--disable-shared" + "--disable-cxx" + "--disable-libdl" + # For fixing an issue when loading native lib: cannot allocate memory in + # static TLS block. + "--disable-initial-exec-tls" + "CFLAGS=-fPIC" + "CXXFLAGS=-fPIC") + set(JEMALLOC_BUILD_COMMAND ${MAKE} ${MAKE_BUILD_ARGS}) + ExternalProject_Add( + jemalloc_ep + URL ${JEMALLOC_SOURCE_URL} + PATCH_COMMAND touch doc/jemalloc.3 doc/jemalloc.html + CONFIGURE_COMMAND "./configure" ${JEMALLOC_CONFIGURE_ARGS} + BUILD_COMMAND ${JEMALLOC_BUILD_COMMAND} + BUILD_IN_SOURCE 1 + BUILD_BYPRODUCTS "${JEMALLOC_STATIC_LIB}" + INSTALL_COMMAND make install) + + file(MAKE_DIRECTORY "${JEMALLOC_INCLUDE_DIR}") + add_library(jemalloc::libjemalloc STATIC IMPORTED) + set_target_properties( + jemalloc::libjemalloc + PROPERTIES INTERFACE_LINK_LIBRARIES Threads::Threads + IMPORTED_LOCATION "${JEMALLOC_STATIC_LIB}" + INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}") + add_dependencies(jemalloc::libjemalloc jemalloc_ep) +endmacro() diff --git a/cpp/CMake/ConfigArrow.cmake b/cpp/CMake/ConfigArrow.cmake index 1ae4ece1b8ef7..e27a3414dd313 100644 --- a/cpp/CMake/ConfigArrow.cmake +++ b/cpp/CMake/ConfigArrow.cmake @@ -15,41 +15,41 @@ # specific language governing permissions and limitations # under the License. 
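The new Buildjemalloc_pic.cmake above gives jemalloc::libjemalloc an INTERFACE_LINK_LIBRARIES of Threads::Threads, so a caller has to find Threads first, and the je_gluten_ symbol prefix means the allocator is reached through the prefixed API rather than by overriding global malloc. A hedged sketch of a caller, assuming cpp/CMake is on CMAKE_MODULE_PATH and that MAKE/MAKE_BUILD_ARGS are defined as in the main build; my_native_lib is a placeholder target.

# Sketch only -- not copied from a real call site in this patch.
include(ExternalProject)       # used by build_jemalloc()'s ExternalProject_Add
find_package(Threads REQUIRED) # jemalloc::libjemalloc's link interface needs Threads::Threads
include(Buildjemalloc_pic)

build_jemalloc()

add_library(my_native_lib SHARED MyNativeLib.cc) # placeholder target/source
# --with-jemalloc-prefix=je_gluten_ means allocations go through the prefixed
# je_gluten_* entry points; the system malloc is left untouched.
target_link_libraries(my_native_lib PRIVATE jemalloc::libjemalloc)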
-if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(ARROW_SHARED_LIBRARY_SUFFIX ".1500.dylib") - set(ARROW_SHARED_LIBRARY_PARENT_SUFFIX ".1500.1.0.dylib") -else() - set(ARROW_SHARED_LIBRARY_SUFFIX ".so.1500") - set(ARROW_SHARED_LIBRARY_PARENT_SUFFIX ".so.1500.1.0") -endif() +set(ARROW_STATIC_LIBRARY_SUFFIX ".a") set(ARROW_LIB_NAME "arrow") set(PARQUET_LIB_NAME "parquet") -set(ARROW_DATASET_LIB_NAME "arrow_dataset") -set(ARROW_SUBSTRAIT_LIB_NAME "arrow_substrait") +set(ARROW_BUNDLED_DEPS "arrow_bundled_dependencies") + +set(ARROW_INSTALL_DIR "${ARROW_HOME}/install") +set(ARROW_LIB_DIR "${ARROW_INSTALL_DIR}/lib") +set(ARROW_LIB64_DIR "${ARROW_INSTALL_DIR}/lib64") +set(ARROW_INCLUDE_DIR "${ARROW_INSTALL_DIR}/include") function(FIND_ARROW_LIB LIB_NAME) if(NOT TARGET Arrow::${LIB_NAME}) - set(ARROW_LIB_FULL_NAME ${CMAKE_SHARED_LIBRARY_PREFIX}${LIB_NAME}${ARROW_SHARED_LIBRARY_SUFFIX}) - add_library(Arrow::${LIB_NAME} SHARED IMPORTED) - find_library(ARROW_LIB_${LIB_NAME} - NAMES ${ARROW_LIB_FULL_NAME} - PATHS ${ARROW_LIB_DIR} ${ARROW_LIB64_DIR} - NO_DEFAULT_PATH) + set(ARROW_LIB_FULL_NAME + ${CMAKE_SHARED_LIBRARY_PREFIX}${LIB_NAME}${ARROW_STATIC_LIBRARY_SUFFIX}) + add_library(Arrow::${LIB_NAME} STATIC IMPORTED) + # Firstly find the lib from velox's arrow build path. If not found, try to + # find it from system. + find_library( + ARROW_LIB_${LIB_NAME} + NAMES ${ARROW_LIB_FULL_NAME} + PATHS ${ARROW_LIB_DIR} ${ARROW_LIB64_DIR}) if(NOT ARROW_LIB_${LIB_NAME}) message(FATAL_ERROR "Arrow library Not Found: ${ARROW_LIB_FULL_NAME}") + endif() + message(STATUS "Found Arrow library: ${ARROW_LIB_${LIB_NAME}}") + if(LIB_NAME STREQUAL ${ARROW_BUNDLED_DEPS}) + set_target_properties( + Arrow::${LIB_NAME} PROPERTIES IMPORTED_LOCATION + ${ARROW_LIB_${LIB_NAME}}) else() - message(STATUS "Found Arrow library: ${ARROW_LIB_${LIB_NAME}}") - set_target_properties(Arrow::${LIB_NAME} - PROPERTIES IMPORTED_LOCATION "${ARROW_LIB_${LIB_NAME}}" - INTERFACE_INCLUDE_DIRECTORIES - "${ARROW_HOME}/install/include") + set_target_properties( + Arrow::${LIB_NAME} + PROPERTIES IMPORTED_LOCATION ${ARROW_LIB_${LIB_NAME}} + INTERFACE_INCLUDE_DIRECTORIES ${ARROW_HOME}/install/include) endif() - file(COPY ${ARROW_LIB_${LIB_NAME}} DESTINATION ${root_directory}/releases/ FOLLOW_SYMLINK_CHAIN) endif() endfunction() - -set(ARROW_INSTALL_DIR "${ARROW_HOME}/install") -set(ARROW_LIB_DIR "${ARROW_INSTALL_DIR}/lib") -set(ARROW_LIB64_DIR "${ARROW_INSTALL_DIR}/lib64") -set(ARROW_INCLUDE_DIR "${ARROW_INSTALL_DIR}/include") diff --git a/cpp/CMake/FindThrift.cmake b/cpp/CMake/FindThrift.cmake index 07028971d9fcc..273500a6ae369 100644 --- a/cpp/CMake/FindThrift.cmake +++ b/cpp/CMake/FindThrift.cmake @@ -12,27 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -# - Find Thrift (a cross platform RPC lib/tool) +# * Find Thrift (a cross platform RPC lib/tool) # # Variables used by this module, they can change the default behaviour and need # to be set before calling find_package: # -# Thrift_ROOT - When set, this path is inspected instead of standard library -# locations as the root of the Thrift installation. -# The environment variable THRIFT_HOME overrides this variable. +# Thrift_ROOT - When set, this path is inspected instead of standard library +# locations as the root of the Thrift installation. The environment variable +# THRIFT_HOME overrides this variable. 
# -# This module defines -# Thrift_FOUND, whether Thrift is found or not -# Thrift_COMPILER_FOUND, whether Thrift compiler is found or not +# This module defines Thrift_FOUND, whether Thrift is found or not +# Thrift_COMPILER_FOUND, whether Thrift compiler is found or not # -# thrift::thrift, a library target to use Thrift -# thrift::compiler, a executable target to use Thrift compiler +# thrift::thrift, a library target to use Thrift thrift::compiler, a executable +# target to use Thrift compiler function(EXTRACT_THRIFT_VERSION) if(THRIFT_INCLUDE_DIR) file(READ "${THRIFT_INCLUDE_DIR}/thrift/config.h" THRIFT_CONFIG_H_CONTENT) - string(REGEX MATCH "#define PACKAGE_VERSION \"[0-9.]+\"" THRIFT_VERSION_DEFINITION - "${THRIFT_CONFIG_H_CONTENT}") + string(REGEX MATCH "#define PACKAGE_VERSION \"[0-9.]+\"" + THRIFT_VERSION_DEFINITION "${THRIFT_CONFIG_H_CONTENT}") string(REGEX MATCH "[0-9.]+" Thrift_VERSION "${THRIFT_VERSION_DEFINITION}") set(Thrift_VERSION "${Thrift_VERSION}" @@ -66,14 +65,16 @@ set(THRIFT_LIB_NAME_BASE "thrift${THRIFT_MSVC_LIB_SUFFIX}") if(ARROW_THRIFT_USE_SHARED) set(THRIFT_LIB_NAMES thrift) if(CMAKE_IMPORT_LIBRARY_SUFFIX) - list(APPEND - THRIFT_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}" + list( + APPEND + THRIFT_LIB_NAMES + "${CMAKE_IMPORT_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}" ) endif() - list(APPEND - THRIFT_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}" + list( + APPEND + THRIFT_LIB_NAMES + "${CMAKE_SHARED_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}" ) else() set(THRIFT_LIB_NAMES @@ -82,20 +83,24 @@ else() endif() if(Thrift_ROOT) - find_library(THRIFT_LIB - NAMES ${THRIFT_LIB_NAMES} - PATHS ${Thrift_ROOT} - PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib") - find_path(THRIFT_INCLUDE_DIR thrift/Thrift.h - PATHS ${Thrift_ROOT} - PATH_SUFFIXES "include") - find_program(THRIFT_COMPILER thrift - PATHS ${Thrift_ROOT} - PATH_SUFFIXES "bin") + find_library( + THRIFT_LIB + NAMES ${THRIFT_LIB_NAMES} + PATHS ${Thrift_ROOT} + PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib") + find_path( + THRIFT_INCLUDE_DIR thrift/Thrift.h + PATHS ${Thrift_ROOT} + PATH_SUFFIXES "include") + find_program( + THRIFT_COMPILER thrift + PATHS ${Thrift_ROOT} + PATH_SUFFIXES "bin") extract_thrift_version() else() - # THRIFT-4760: The pkgconfig files are currently only installed when using autotools. - # Starting with 0.13, they are also installed for the CMake-based installations of Thrift. + # THRIFT-4760: The pkgconfig files are currently only installed when using + # autotools. Starting with 0.13, they are also installed for the CMake-based + # installations of Thrift. 
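As the reflowed header comment states, FindThrift.cmake checks Thrift_ROOT (or the THRIFT_HOME environment variable) first and only then falls back to pkg-config and the default search paths, exporting thrift::thrift and thrift::compiler. A minimal consumer sketch, assuming the module directory is on CMAKE_MODULE_PATH; the executable name and the /opt/thrift prefix are examples, not values from this patch.

# Sketch only -- consuming FindThrift.cmake from an out-of-tree project.
cmake_minimum_required(VERSION 3.16)
project(thrift_consumer CXX)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake")

# Point the module at a custom installation; /opt/thrift is an example path.
set(Thrift_ROOT "/opt/thrift" CACHE PATH "Thrift installation root")

find_package(Thrift REQUIRED)
message(STATUS "Thrift version: ${Thrift_VERSION}")

add_executable(thrift_demo main.cc) # placeholder executable/source
target_link_libraries(thrift_demo PRIVATE thrift::thrift)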
find_package(PkgConfig QUIET) pkg_check_modules(THRIFT_PC thrift) if(THRIFT_PC_FOUND) @@ -103,19 +108,22 @@ else() list(APPEND THRIFT_PC_LIBRARY_DIRS "${THRIFT_PC_LIBDIR}") - find_library(THRIFT_LIB - NAMES ${THRIFT_LIB_NAMES} - PATHS ${THRIFT_PC_LIBRARY_DIRS} - NO_DEFAULT_PATH) - find_program(THRIFT_COMPILER thrift - HINTS ${THRIFT_PC_PREFIX} - NO_DEFAULT_PATH - PATH_SUFFIXES "bin") + find_library( + THRIFT_LIB + NAMES ${THRIFT_LIB_NAMES} + PATHS ${THRIFT_PC_LIBRARY_DIRS} + NO_DEFAULT_PATH) + find_program( + THRIFT_COMPILER thrift + HINTS ${THRIFT_PC_PREFIX} + NO_DEFAULT_PATH + PATH_SUFFIXES "bin") set(Thrift_VERSION ${THRIFT_PC_VERSION}) else() - find_library(THRIFT_LIB - NAMES ${THRIFT_LIB_NAMES} - PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib") + find_library( + THRIFT_LIB + NAMES ${THRIFT_LIB_NAMES} + PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib") find_path(THRIFT_INCLUDE_DIR thrift/Thrift.h PATH_SUFFIXES "include") find_program(THRIFT_COMPILER thrift PATH_SUFFIXES "bin") extract_thrift_version() @@ -140,14 +148,15 @@ if(Thrift_FOUND) else() add_library(thrift::thrift STATIC IMPORTED) endif() - set_target_properties(thrift::thrift - PROPERTIES IMPORTED_LOCATION "${THRIFT_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${THRIFT_INCLUDE_DIR}") + set_target_properties( + thrift::thrift + PROPERTIES IMPORTED_LOCATION "${THRIFT_LIB}" INTERFACE_INCLUDE_DIRECTORIES + "${THRIFT_INCLUDE_DIR}") if(WIN32 AND NOT MSVC_TOOLCHAIN) - # We don't need this for Visual C++ because Thrift uses - # "#pragma comment(lib, "Ws2_32.lib")" in - # thrift/windows/config.h for Visual C++. - set_target_properties(thrift::thrift PROPERTIES INTERFACE_LINK_LIBRARIES "ws2_32") + # We don't need this for Visual C++ because Thrift uses "#pragma + # comment(lib, "Ws2_32.lib")" in thrift/windows/config.h for Visual C++. + set_target_properties(thrift::thrift PROPERTIES INTERFACE_LINK_LIBRARIES + "ws2_32") endif() if(Thrift_COMPILER_FOUND) diff --git a/cpp/CMake/FindZstd.cmake b/cpp/CMake/FindZstd.cmake index a7efd5adfd3f0..62e8b874d7359 100644 --- a/cpp/CMake/FindZstd.cmake +++ b/cpp/CMake/FindZstd.cmake @@ -15,43 +15,44 @@ # specific language governing permissions and limitations # under the License. 
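FindZstd.cmake, reformatted in the hunk that follows, keys its search off the ZSTD_HOME environment variable and falls back to /usr/local; BuildQATZstd.cmake earlier in this patch consumes it with a plain include(FindZstd). A sketch of that usage pattern; the install path and the consuming target are placeholders.

# Sketch only -- FindZstd is include()d directly, as BuildQATZstd.cmake does.
set(ENV{ZSTD_HOME} "/opt/zstd") # optional; the module defaults to /usr/local

include(FindZstd)

message(STATUS "zstd header:  ${ZSTD_INCLUDE_DIR}/zstd.h")
message(STATUS "zstd library: ${ZSTD_LIBRARY}")

# ZSTD_LIBRARY is a plain path, not an imported target, so link it directly.
target_link_libraries(my_compression_lib PRIVATE "${ZSTD_LIBRARY}") # placeholder target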
-# ZSTD_HOME environmental variable is used to check for Zstd headers and static library +# ZSTD_HOME environmental variable is used to check for Zstd headers and static +# library -# ZSTD_INCLUDE_DIR: directory containing headers -# ZSTD_LIBRARY: path to libzstd.so -# ZSTD_FOUND: whether zstd has been found +# ZSTD_INCLUDE_DIR: directory containing headers ZSTD_LIBRARY: path to +# libzstd.so ZSTD_FOUND: whether zstd has been found -if (NOT "$ENV{ZSTD_HOME}" STREQUAL "") +if(NOT "$ENV{ZSTD_HOME}" STREQUAL "") file(TO_CMAKE_PATH "$ENV{ZSTD_HOME}" _zstd_path) message(STATUS "ZSTD_HOME: ${_zstd_path}") else() set(_zstd_path "/usr/local") endif() -find_path(ZSTD_INCLUDE_DIR zstd.h HINTS - ${_zstd_path} - PATH_SUFFIXES "include") +find_path( + ZSTD_INCLUDE_DIR zstd.h + HINTS ${_zstd_path} + PATH_SUFFIXES "include") -find_library (ZSTD_LIBRARY NAMES zstd HINTS - ${_zstd_path} - PATH_SUFFIXES "lib") +find_library( + ZSTD_LIBRARY + NAMES zstd + HINTS ${_zstd_path} + PATH_SUFFIXES "lib") -if (ZSTD_INCLUDE_DIR AND ZSTD_LIBRARY) +if(ZSTD_INCLUDE_DIR AND ZSTD_LIBRARY) set(ZSTD_FOUND TRUE) set(ZSTD_HEADER_NAME zstd.h) set(ZSTD_HEADER ${ZSTD_INCLUDE_DIR}/${ZSTD_HEADER_NAME}) -else () +else() set(ZSTD_FOUND FALSE) -endif () +endif() -if (ZSTD_FOUND) +if(ZSTD_FOUND) message(STATUS "Found the zstd header: ${ZSTD_HEADER}") message(STATUS "Found the zstd static library: ${ZSTD_LIBRARY}") -else () - message(FATAL_ERROR ZSTD_ERR_MSG "Could not find zstd. Looked in ${_zstd_path}.") -endif () - -mark_as_advanced( - ZSTD_INCLUDE_DIR - ZSTD_LIBRARY) +else() + message(FATAL_ERROR ZSTD_ERR_MSG + "Could not find zstd. Looked in ${_zstd_path}.") +endif() +mark_as_advanced(ZSTD_INCLUDE_DIR ZSTD_LIBRARY) diff --git a/cpp/CMake/Findglog.cmake b/cpp/CMake/Findglog.cmake index b165fd80f3d7f..6d9dbdacf1b13 100644 --- a/cpp/CMake/Findglog.cmake +++ b/cpp/CMake/Findglog.cmake @@ -22,23 +22,17 @@ if(NOT BUILD_GLOG) include(FindPackageHandleStandardArgs) include(SelectLibraryConfigurations) - find_library(GLOG_LIBRARY_RELEASE glog - PATHS ${GLOG_LIBRARYDIR}) - find_library(GLOG_LIBRARY_DEBUG glogd - PATHS ${GLOG_LIBRARYDIR}) + find_library(GLOG_LIBRARY_RELEASE glog PATHS ${GLOG_LIBRARYDIR}) + find_library(GLOG_LIBRARY_DEBUG glogd PATHS ${GLOG_LIBRARYDIR}) - find_path(GLOG_INCLUDE_DIR glog/logging.h - PATHS ${GLOG_INCLUDEDIR}) + find_path(GLOG_INCLUDE_DIR glog/logging.h PATHS ${GLOG_INCLUDEDIR}) select_library_configurations(GLOG) - find_package_handle_standard_args(glog DEFAULT_MSG - GLOG_LIBRARY - GLOG_INCLUDE_DIR) + find_package_handle_standard_args(glog DEFAULT_MSG GLOG_LIBRARY + GLOG_INCLUDE_DIR) - mark_as_advanced( - GLOG_LIBRARY - GLOG_INCLUDE_DIR) + mark_as_advanced(GLOG_LIBRARY GLOG_INCLUDE_DIR) endif() if(NOT glog_FOUND) @@ -56,26 +50,40 @@ endif() # glog::glog may already exist. Use google::glog to avoid conflicts. 
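Findglog.cmake ultimately exposes a single imported target, google::glog, and hangs the matching gflags target (or gflags_static when gflags has to be built) off its link interface, so consumers only name the one target. A sketch, assuming the variables the module expects (BUILD_GLOG, libglog_type, libgflags_component) are set the way the main build sets them; glog_demo is a placeholder.

# Sketch only -- linking the imported google::glog target from Findglog.cmake.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake")
include(Findglog)

add_executable(glog_demo demo.cc) # placeholder executable/source
# gflags arrives transitively through google::glog's link interface.
target_link_libraries(glog_demo PRIVATE google::glog)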
add_library(google::glog ${libglog_type} IMPORTED) -set_target_properties(google::glog PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${GLOG_INCLUDE_DIR}") -set_target_properties(google::glog PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES "C" IMPORTED_LOCATION "${GLOG_LIBRARY}") +set_target_properties(google::glog PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${GLOG_INCLUDE_DIR}") +set_target_properties( + google::glog PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES "C" + IMPORTED_LOCATION "${GLOG_LIBRARY}") set(GLUTEN_GFLAGS_VERSION 2.2.2) -find_package(gflags ${GLUTEN_GFLAGS_VERSION} CONFIG COMPONENTS ${libgflags_component}) +find_package(gflags ${GLUTEN_GFLAGS_VERSION} CONFIG + COMPONENTS ${libgflags_component}) if(NOT gflags_FOUND AND glog_FOUND) - message(FATAL_ERROR "Glog found but Gflags not found. Set BUILD_GLOG=ON and reload cmake.") + message( + FATAL_ERROR + "Glog found but Gflags not found. Set BUILD_GLOG=ON and reload cmake.") endif() if(gflags_FOUND) - if(NOT TARGET gflags::gflags_${libgflags_component} AND NOT TARGET gflags_${libgflags_component}) - message(FATAL_ERROR "Found Gflags but missing component gflags_${libgflags_component}") + if(NOT TARGET gflags::gflags_${libgflags_component} + AND NOT TARGET gflags_${libgflags_component}) + message( + FATAL_ERROR + "Found Gflags but missing component gflags_${libgflags_component}") endif() if(TARGET gflags::gflags_${libgflags_component}) - set_target_properties(google::glog PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES gflags::gflags_${libgflags_component}) + set_target_properties( + google::glog PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES + gflags::gflags_${libgflags_component}) else() - set_target_properties(google::glog PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES gflags_${libgflags_component}) + set_target_properties( + google::glog PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES + gflags_${libgflags_component}) endif() else() include(BuildGflags) - set_target_properties(google::glog PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES gflags_static) + set_target_properties( + google::glog PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES gflags_static) endif() diff --git a/cpp/CMake/Findjemalloc_pic.cmake b/cpp/CMake/Findjemalloc_pic.cmake index 9511dcd336632..ca7b7d213dfc8 100644 --- a/cpp/CMake/Findjemalloc_pic.cmake +++ b/cpp/CMake/Findjemalloc_pic.cmake @@ -17,71 +17,25 @@ # Find Jemalloc macro(find_jemalloc) - # Find the existing Protobuf + # Find the existing jemalloc set(CMAKE_FIND_LIBRARY_SUFFIXES ".a") - find_package(jemalloc_pic) - if ("${Jemalloc_LIBRARY}" STREQUAL "Jemalloc_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Jemalloc Library Not Found") - endif() - set(PROTOC_BIN ${Jemalloc_PROTOC_EXECUTABLE}) -endmacro() - -# Building Jemalloc -macro(build_jemalloc) - message(STATUS "Building Jemalloc from Source") - - if(DEFINED ENV{GLUTEN_JEMALLOC_URL}) - set(JEMALLOC_SOURCE_URL "$ENV{GLUTEN_JEMALLOC_URL}") + # Find from vcpkg-installed lib path. 
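The rewritten find_jemalloc() above no longer aborts the configure step; it just reports JEMALLOC_NOT_FOUND when the vcpkg-installed jemalloc_pic archive is missing (see the find_library call that follows), which pairs naturally with the build_jemalloc() macro moved into Buildjemalloc_pic.cmake. The sketch below shows one way a caller could combine the two; note that this patch removes the old jemalloc wiring from cpp/core/CMakeLists.txt, so the call site and the my_backend_lib target here are assumptions, not code from the repository.

# Sketch only -- prefer the vcpkg-provided static jemalloc, otherwise build it.
include(Findjemalloc_pic)  # defines find_jemalloc()
include(Buildjemalloc_pic) # defines build_jemalloc()

find_jemalloc()
if(JEMALLOC_NOT_FOUND)
  build_jemalloc()
endif()

target_link_libraries(my_backend_lib PRIVATE jemalloc::libjemalloc) # placeholder target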
+ find_library( + JEMALLOC_LIBRARY + NAMES jemalloc_pic + PATHS + ${CMAKE_CURRENT_BINARY_DIR}/../../../dev/vcpkg/vcpkg_installed/x64-linux-avx/lib/ + NO_DEFAULT_PATH) + if("${JEMALLOC_LIBRARY}" STREQUAL "JEMALLOC_LIBRARY-NOTFOUND") + message(STATUS "Jemalloc Library Not Found.") + set(JEMALLOC_NOT_FOUND TRUE) else() - set(JEMALLOC_BUILD_VERSION "5.2.1") - set(JEMALLOC_SOURCE_URL - "https://github.com/jemalloc/jemalloc/releases/download/${JEMALLOC_BUILD_VERSION}/jemalloc-${JEMALLOC_BUILD_VERSION}.tar.bz2" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/jemalloc-${JEMALLOC_BUILD_VERSION}.tar.bz2" - ) + message(STATUS "Found jemalloc: ${JEMALLOC_LIBRARY}") + find_path(JEMALLOC_INCLUDE_DIR jemalloc/jemalloc.h) + add_library(jemalloc::libjemalloc STATIC IMPORTED) + set_target_properties( + jemalloc::libjemalloc + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}" + IMPORTED_LOCATION "${JEMALLOC_LIBRARY}") endif() - - set(JEMALLOC_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/jemalloc_ep-install") - set(JEMALLOC_LIB_DIR "${JEMALLOC_PREFIX}/lib") - set(JEMALLOC_INCLUDE_DIR "${JEMALLOC_PREFIX}/include") - set( - JEMALLOC_STATIC_LIB - "${JEMALLOC_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}jemalloc_pic${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - set( - JEMALLOC_INCLUDE - "${JEMALLOC_PREFIX}/include" - ) - set(JEMALLOC_CONFIGURE_ARGS - "AR=${CMAKE_AR}" - "CC=${CMAKE_C_COMPILER}" - "--prefix=${JEMALLOC_PREFIX}" - "--libdir=${JEMALLOC_LIB_DIR}" - "--with-jemalloc-prefix=je_gluten_" - "--with-private-namespace=je_gluten_private_" - "--without-export" - "--disable-shared" - "--disable-cxx" - "--disable-libdl" - "--disable-initial-exec-tls" - "CFLAGS=-fPIC" - "CXXFLAGS=-fPIC") - set(JEMALLOC_BUILD_COMMAND ${MAKE} ${MAKE_BUILD_ARGS}) - ExternalProject_Add(jemalloc_ep - URL ${JEMALLOC_SOURCE_URL} - PATCH_COMMAND touch doc/jemalloc.3 doc/jemalloc.html - CONFIGURE_COMMAND "./configure" ${JEMALLOC_CONFIGURE_ARGS} - BUILD_COMMAND ${JEMALLOC_BUILD_COMMAND} - BUILD_IN_SOURCE 1 - BUILD_BYPRODUCTS "${JEMALLOC_STATIC_LIB}" - INSTALL_COMMAND make install) - - file(MAKE_DIRECTORY "${JEMALLOC_INCLUDE_DIR}") - add_library(jemalloc::libjemalloc STATIC IMPORTED) - set_target_properties( - jemalloc::libjemalloc - PROPERTIES INTERFACE_LINK_LIBRARIES Threads::Threads - IMPORTED_LOCATION "${JEMALLOC_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${JEMALLOC_INCLUDE_DIR}") - add_dependencies(jemalloc::libjemalloc protobuf_ep) endmacro() diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d6e3eeb133f74..c5cbab0697bf4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -19,13 +19,13 @@ message(STATUS "Building using CMake version: ${CMAKE_VERSION}") set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -# The set(CACHE) command does not remove any normal variable of the same name from the current scope -# https://cmake.org/cmake/help/latest/policy/CMP0126.html +# The set(CACHE) command does not remove any normal variable of the same name +# from the current scope https://cmake.org/cmake/help/latest/policy/CMP0126.html if(POLICY CMP0126) cmake_policy(SET CMP0126 NEW) endif() -if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") cmake_policy(SET CMP0135 NEW) endif() @@ -36,7 +36,9 @@ if(NOT DEFINED CMAKE_BUILD_TYPE) endif() set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake" ${CMAKE_MODULE_PATH}) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON CACHE INTERNAL "") +set(CMAKE_EXPORT_COMPILE_COMMANDS + ON + CACHE INTERNAL "") project(gluten) @@ -60,15 
+62,19 @@ option(ENABLE_ABFS "Enable ABFS" OFF) set(root_directory ${PROJECT_BINARY_DIR}) get_filename_component(GLUTEN_HOME ${CMAKE_SOURCE_DIR} DIRECTORY) -if (NOT DEFINED VELOX_HOME) +if(NOT DEFINED VELOX_HOME) set(VELOX_HOME ${GLUTEN_HOME}/ep/build-velox/build/velox_ep) message(STATUS "Set VELOX_HOME to ${VELOX_HOME}") endif() -if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") - set(ARROW_HOME ${VELOX_HOME}/_build/debug/third_party/arrow_ep) +if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + set(ARROW_HOME + ${VELOX_HOME}/_build/debug/CMake/resolve_dependency_modules/arrow/arrow_ep/ + ) else() - set(ARROW_HOME ${VELOX_HOME}/_build/release/third_party/arrow_ep) + set(ARROW_HOME + ${VELOX_HOME}/_build/release/CMake/resolve_dependency_modules/arrow/arrow_ep + ) endif() include(ResolveDependency) @@ -77,10 +83,10 @@ include(ResolveDependency) # Compiler flags # -if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") +if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -ggdb -O0") message(STATUS "CMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}") -else () +else() add_definitions(-DNDEBUG) message(STATUS "Add definition NDEBUG") endif() @@ -97,22 +103,20 @@ set(KNOWN_WARNINGS -Wno-ignored-qualifiers") if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - set(KNOWN_WARNINGS - "-Wno-error=unused-but-set-variable \ + set(KNOWN_WARNINGS "-Wno-error=unused-but-set-variable \ ${KNOWN_WARNINGS}") if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11) - set(KNOWN_WARNINGS - "-Wno-error=maybe-uninitialized \ + set(KNOWN_WARNINGS "-Wno-error=maybe-uninitialized \ ${KNOWN_WARNINGS}") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") +elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") # Experimental set(KNOWN_WARNINGS "-Wno-implicit-int-float-conversion \ -Wno-nullability-completeness \ -Wno-mismatched-tags \ ${KNOWN_WARNINGS}") -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") +elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") # Experimental set(KNOWN_WARNINGS "-Wno-implicit-int-float-conversion \ @@ -126,15 +130,16 @@ else() endif() # see https://issues.apache.org/jira/browse/ARROW-4665 -if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") +if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") set(KNOWN_WARNINGS "-Wno-macro-redefined \ -Wno-nullability-completeness \ -Wno-pessimizing-move \ -Wno-mismatched-tags \ ${KNOWN_WARNINGS}") - # Specific definition for an issue with boost/stacktrace when building on macOS. - # See https://github.com/boostorg/stacktrace/issues/88 and comments therein. + # Specific definition for an issue with boost/stacktrace when building on + # macOS. See https://github.com/boostorg/stacktrace/issues/88 and comments + # therein. 
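The add_test_case() helper whose argument handling is condensed in the next hunk parses SOURCES, EXTRA_LINK_LIBS, EXTRA_INCLUDES and EXTRA_DEPENDENCIES as multi-value keywords and always links gluten, glog and GTest into the test executable. A hypothetical invocation; the test name, source file and extra library are made up for illustration.

# Sketch only -- calling the add_test_case() helper from cpp/CMakeLists.txt.
add_test_case(
  my_shuffle_test                 # placeholder test name
  SOURCES tests/MyShuffleTest.cc  # placeholder source
  EXTRA_INCLUDES ${CMAKE_SOURCE_DIR}/core
  EXTRA_LINK_LIBS Arrow::parquet)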
add_compile_definitions(_GNU_SOURCE) endif() @@ -166,17 +171,11 @@ endif() function(ADD_TEST_CASE TEST_NAME) set(options) set(one_value_args) - set(multi_value_args - SOURCES - EXTRA_LINK_LIBS - EXTRA_INCLUDES - EXTRA_DEPENDENCIES) - - cmake_parse_arguments(ARG - "${options}" - "${one_value_args}" - "${multi_value_args}" - ${ARGN}) + set(multi_value_args SOURCES EXTRA_LINK_LIBS EXTRA_INCLUDES + EXTRA_DEPENDENCIES) + + cmake_parse_arguments(ARG "${options}" "${one_value_args}" + "${multi_value_args}" ${ARGN}) if(ARG_SOURCES) set(SOURCES ${ARG_SOURCES}) @@ -185,7 +184,8 @@ function(ADD_TEST_CASE TEST_NAME) endif() add_executable(${TEST_NAME} ${SOURCES}) - target_link_libraries(${TEST_NAME} gluten google::glog GTest::gtest GTest::gtest_main Threads::Threads) + target_link_libraries(${TEST_NAME} gluten google::glog GTest::gtest + GTest::gtest_main Threads::Threads) target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/core) if(ARG_EXTRA_LINK_LIBS) diff --git a/cpp/core/CMakeLists.txt b/cpp/core/CMakeLists.txt index dc9ce3435c380..cc5b6c7e926df 100644 --- a/cpp/core/CMakeLists.txt +++ b/cpp/core/CMakeLists.txt @@ -23,25 +23,27 @@ include(GNUInstallDirs) include(CheckCXXCompilerFlag) # Only set arch=native for non-AppleClang compilers. -if (NOT CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") +if(NOT CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") endif() set(BOOST_MIN_VERSION "1.42.0") find_package(Boost REQUIRED) -INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS}) +include_directories(${Boost_INCLUDE_DIRS}) set(source_root_directory ${CMAKE_CURRENT_SOURCE_DIR}) -if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") cmake_policy(SET CMP0135 NEW) endif() set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake" ${CMAKE_MODULE_PATH}) -set(SUBSTRAIT_PROTO_SRC_DIR ${GLUTEN_HOME}/gluten-core/src/main/resources/substrait/proto) +set(SUBSTRAIT_PROTO_SRC_DIR + ${GLUTEN_HOME}/gluten-core/src/main/resources/substrait/proto) message(STATUS "Set Substrait Proto Directory in ${SUBSTRAIT_PROTO_SRC_DIR}") -set(GLUTEN_PROTO_SRC_DIR ${GLUTEN_HOME}/gluten-core/src/main/resources/org/apache/gluten/proto) +set(GLUTEN_PROTO_SRC_DIR + ${GLUTEN_HOME}/gluten-core/src/main/resources/org/apache/gluten/proto) message(STATUS "Set Gluten Proto Directory in ${GLUTEN_PROTO_SRC_DIR}") find_program(CCACHE_FOUND ccache) @@ -58,31 +60,25 @@ macro(build_protobuf) set(PROTOBUF_SOURCE_URL "$ENV{GLUTEN_PROTOBUF_URL}") else() set(PROTOBUF_BUILD_VERSION "21.4") - set (PROTOBUF_SOURCE_URL - "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOBUF_BUILD_VERSION}/protobuf-all-${PROTOBUF_BUILD_VERSION}.tar.gz") + set(PROTOBUF_SOURCE_URL + "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOBUF_BUILD_VERSION}/protobuf-all-${PROTOBUF_BUILD_VERSION}.tar.gz" + ) endif() - set(PROTOBUF_BUILD_SHA256_CHECKSUM "6c5e1b0788afba4569aeebb2cfe205cb154aa01deacaba0cd26442f3b761a836") + set(PROTOBUF_BUILD_SHA256_CHECKSUM + "6c5e1b0788afba4569aeebb2cfe205cb154aa01deacaba0cd26442f3b761a836") set(PROTOBUF_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/protobuf_ep-install") set(PROTOBUF_INCLUDE_DIR "${PROTOBUF_PREFIX}/include") - set( - PROTOBUF_STATIC_LIB - "${PROTOBUF_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}protobuf${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - set( - PROTOC_STATIC_LIB - "${PROTOBUF_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}protoc${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - set( - PROTOC_BIN - "${PROTOBUF_PREFIX}/bin/protoc" - ) - 
set( - PROTOBUF_INCLUDE - "${PROTOBUF_PREFIX}/include" - CACHE PATH - "Protobuf include path" - ) + set(PROTOBUF_STATIC_LIB + "${PROTOBUF_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}protobuf${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(PROTOC_STATIC_LIB + "${PROTOBUF_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}protoc${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(PROTOC_BIN "${PROTOBUF_PREFIX}/bin/protoc") + set(PROTOBUF_INCLUDE + "${PROTOBUF_PREFIX}/include" + CACHE PATH "Protobuf include path") set(PROTOBUF_COMPILER "${PROTOBUF_PREFIX}/bin/protoc") set(PROTOBUF_CONFIGURE_ARGS "AR=${CMAKE_AR}" @@ -94,22 +90,23 @@ macro(build_protobuf) "CFLAGS=-fPIC" "CXXFLAGS=-fPIC") set(PROTOBUF_BUILD_COMMAND ${MAKE} ${MAKE_BUILD_ARGS}) - ExternalProject_Add(protobuf_ep - PREFIX protobuf_ep - CONFIGURE_COMMAND ./autogen.sh COMMAND "./configure" ${PROTOBUF_CONFIGURE_ARGS} - BUILD_BYPRODUCTS "${PROTOBUF_STATIC_LIB}" "${PROTOBUF_COMPILER}" - BUILD_COMMAND ${PROTOBUF_BUILD_COMMAND} - BUILD_IN_SOURCE 1 - URL ${PROTOBUF_SOURCE_URL} - URL_HASH "SHA256=${PROTOBUF_BUILD_SHA256_CHECKSUM}" - ) + ExternalProject_Add( + protobuf_ep + PREFIX protobuf_ep + CONFIGURE_COMMAND ./autogen.sh + COMMAND "./configure" ${PROTOBUF_CONFIGURE_ARGS} + BUILD_BYPRODUCTS "${PROTOBUF_STATIC_LIB}" "${PROTOBUF_COMPILER}" + BUILD_COMMAND ${PROTOBUF_BUILD_COMMAND} + BUILD_IN_SOURCE 1 + URL ${PROTOBUF_SOURCE_URL} + URL_HASH "SHA256=${PROTOBUF_BUILD_SHA256_CHECKSUM}") file(MAKE_DIRECTORY "${PROTOBUF_INCLUDE_DIR}") add_library(protobuf::libprotobuf STATIC IMPORTED) set_target_properties( protobuf::libprotobuf - PROPERTIES IMPORTED_LOCATION "${PROTOBUF_STATIC_LIB}" INTERFACE_INCLUDE_DIRECTORIES - "${PROTOBUF_INCLUDE_DIR}") + PROPERTIES IMPORTED_LOCATION "${PROTOBUF_STATIC_LIB}" + INTERFACE_INCLUDE_DIRECTORIES "${PROTOBUF_INCLUDE_DIR}") add_dependencies(protobuf::libprotobuf protobuf_ep) endmacro() @@ -117,17 +114,19 @@ macro(find_protobuf) # Find the existing Protobuf set(CMAKE_FIND_LIBRARY_SUFFIXES ".a") find_package(Protobuf) - if ("${Protobuf_LIBRARY}" STREQUAL "Protobuf_LIBRARY-NOTFOUND") + if("${Protobuf_LIBRARY}" STREQUAL "Protobuf_LIBRARY-NOTFOUND") message(FATAL_ERROR "Protobuf Library Not Found") endif() set(PROTOC_BIN ${Protobuf_PROTOC_EXECUTABLE}) - set(PROTOBUF_INCLUDE "${Protobuf_INCLUDE_DIRS}" CACHE PATH "Protobuf include path") + set(PROTOBUF_INCLUDE + "${Protobuf_INCLUDE_DIRS}" + CACHE PATH "Protobuf include path") endmacro() if(USE_AVX512) # Only enable additional instruction sets if they are supported message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}") - if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") set(AVX512_FLAG "-march=skylake-avx512") check_cxx_compiler_flag(${AVX512_FLAG} CXX_SUPPORTS_AVX512) if(NOT CXX_SUPPORTS_AVX512) @@ -135,7 +134,7 @@ if(USE_AVX512) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX512_FLAG}") add_definitions(-DCOLUMNAR_PLUGIN_USE_AVX512) - endif () + endif() endif() # Set up Proto @@ -144,83 +143,91 @@ file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/proto) # List Substrait Proto compiled files file(GLOB SUBSTRAIT_PROTO_FILES ${SUBSTRAIT_PROTO_SRC_DIR}/substrait/*.proto - ${SUBSTRAIT_PROTO_SRC_DIR}/substrait/extensions/*.proto) -FOREACH(PROTO ${SUBSTRAIT_PROTO_FILES}) + ${SUBSTRAIT_PROTO_SRC_DIR}/substrait/extensions/*.proto) +foreach(PROTO ${SUBSTRAIT_PROTO_FILES}) file(RELATIVE_PATH REL_PROTO ${SUBSTRAIT_PROTO_SRC_DIR} ${PROTO}) string(REGEX REPLACE "\\.proto" "" PROTO_NAME ${REL_PROTO}) - LIST(APPEND 
SUBSTRAIT_PROTO_SRCS "${PROTO_OUTPUT_DIR}/${PROTO_NAME}.pb.cc") - LIST(APPEND SUBSTRAIT_PROTO_HDRS "${PROTO_OUTPUT_DIR}/${PROTO_NAME}.pb.h") -ENDFOREACH() -set(SUBSTRAIT_PROTO_OUTPUT_FILES ${SUBSTRAIT_PROTO_HDRS} ${SUBSTRAIT_PROTO_SRCS}) -set_source_files_properties(${SUBSTRAIT_PROTO_OUTPUT_FILES} PROPERTIES GENERATED TRUE) -get_filename_component(SUBSTRAIT_PROTO_DIR ${SUBSTRAIT_PROTO_SRC_DIR}/ DIRECTORY) + list(APPEND SUBSTRAIT_PROTO_SRCS "${PROTO_OUTPUT_DIR}/${PROTO_NAME}.pb.cc") + list(APPEND SUBSTRAIT_PROTO_HDRS "${PROTO_OUTPUT_DIR}/${PROTO_NAME}.pb.h") +endforeach() +set(SUBSTRAIT_PROTO_OUTPUT_FILES ${SUBSTRAIT_PROTO_HDRS} + ${SUBSTRAIT_PROTO_SRCS}) +set_source_files_properties(${SUBSTRAIT_PROTO_OUTPUT_FILES} PROPERTIES GENERATED + TRUE) +get_filename_component(SUBSTRAIT_PROTO_DIR ${SUBSTRAIT_PROTO_SRC_DIR}/ + DIRECTORY) # List Gluten Proto compiled files file(GLOB GLUTEN_PROTO_FILES ${GLUTEN_PROTO_SRC_DIR}/*.proto) -FOREACH(PROTO ${GLUTEN_PROTO_FILES}) +foreach(PROTO ${GLUTEN_PROTO_FILES}) file(RELATIVE_PATH REL_PROTO ${GLUTEN_PROTO_SRC_DIR} ${PROTO}) string(REGEX REPLACE "\\.proto" "" PROTO_NAME ${REL_PROTO}) - LIST(APPEND GLUTEN_PROTO_SRCS "${PROTO_OUTPUT_DIR}/${PROTO_NAME}.pb.cc") - LIST(APPEND GLUTEN_PROTO_HDRS "${PROTO_OUTPUT_DIR}/${PROTO_NAME}.pb.h") -ENDFOREACH() + list(APPEND GLUTEN_PROTO_SRCS "${PROTO_OUTPUT_DIR}/${PROTO_NAME}.pb.cc") + list(APPEND GLUTEN_PROTO_HDRS "${PROTO_OUTPUT_DIR}/${PROTO_NAME}.pb.h") +endforeach() set(GLUTEN_PROTO_OUTPUT_FILES ${GLUTEN_PROTO_HDRS} ${GLUTEN_PROTO_SRCS}) -set_source_files_properties(${GLUTEN_PROTO_OUTPUT_FILES} PROPERTIES GENERATED TRUE) +set_source_files_properties(${GLUTEN_PROTO_OUTPUT_FILES} PROPERTIES GENERATED + TRUE) get_filename_component(GLUTEN_PROTO_DIR ${GLUTEN_PROTO_SRC_DIR}/ DIRECTORY) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-attributes") +set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-attributes") message("Core module final CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}") set(SPARK_COLUMNAR_PLUGIN_SRCS - ${SUBSTRAIT_PROTO_SRCS} - ${GLUTEN_PROTO_SRCS} - compute/Runtime.cc - compute/ProtobufUtils.cc - compute/ResultIterator.cc - config/GlutenConfig.cc - jni/JniWrapper.cc - memory/AllocationListener.cc - memory/MemoryAllocator.cc - memory/ArrowMemoryPool.cc - memory/ColumnarBatch.cc - operators/writer/ArrowWriter.cc - shuffle/FallbackRangePartitioner.cc - shuffle/HashPartitioner.cc - shuffle/LocalPartitionWriter.cc - shuffle/Options.cc - shuffle/Partitioner.cc - shuffle/Partitioning.cc - shuffle/Payload.cc - shuffle/rss/RssPartitionWriter.cc - shuffle/RoundRobinPartitioner.cc - shuffle/ShuffleMemoryPool.cc - shuffle/ShuffleReader.cc - shuffle/SinglePartitioner.cc - shuffle/Spill.cc - shuffle/Utils.cc - utils/Compression.cc - utils/StringUtil.cc - utils/ObjectStore.cc - jni/JniError.cc - jni/JniCommon.cc) + ${SUBSTRAIT_PROTO_SRCS} + ${GLUTEN_PROTO_SRCS} + compute/Runtime.cc + compute/ProtobufUtils.cc + compute/ResultIterator.cc + config/GlutenConfig.cc + jni/JniWrapper.cc + memory/AllocationListener.cc + memory/MemoryAllocator.cc + memory/ArrowMemoryPool.cc + memory/ColumnarBatch.cc + operators/writer/ArrowWriter.cc + shuffle/FallbackRangePartitioner.cc + shuffle/HashPartitioner.cc + shuffle/LocalPartitionWriter.cc + shuffle/Options.cc + shuffle/Partitioner.cc + shuffle/Partitioning.cc + shuffle/Payload.cc + shuffle/rss/RssPartitionWriter.cc + shuffle/RandomPartitioner.cc + shuffle/RoundRobinPartitioner.cc + shuffle/ShuffleMemoryPool.cc + shuffle/ShuffleReader.cc + 
shuffle/SinglePartitioner.cc + shuffle/Spill.cc + shuffle/Utils.cc + utils/Compression.cc + utils/StringUtil.cc + utils/ObjectStore.cc + jni/JniError.cc + jni/JniCommon.cc) file(MAKE_DIRECTORY ${root_directory}/releases) add_library(gluten SHARED ${SPARK_COLUMNAR_PLUGIN_SRCS}) add_dependencies(gluten jni_proto) if(ENABLE_GLUTEN_VCPKG) - # Hide symbols of some static dependencies. Otherwise, if such dependencies are already - # statically linked to libvelox.so, a runtime error will be reported: xxx is being linked - # both statically and dynamically. - target_link_options(gluten PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/symbols.map) + # Hide symbols of some static dependencies. Otherwise, if such dependencies + # are already statically linked to libvelox.so, a runtime error will be + # reported: xxx is being linked both statically and dynamically. + target_link_options( + gluten PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/symbols.map) endif() if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0) - execute_process(COMMAND ${CMAKE_C_COMPILER} -print-file-name=libstdc++fs.a + execute_process( + COMMAND ${CMAKE_C_COMPILER} -print-file-name=libstdc++fs.a RESULT_VARIABLE LIBSTDCXXFS_STATIC_RESULT OUTPUT_VARIABLE LIBSTDCXXFS_STATIC_PATH OUTPUT_STRIP_TRAILING_WHITESPACE) - if (LIBSTDCXXFS_STATIC_RESULT EQUAL 0 AND EXISTS "${LIBSTDCXXFS_STATIC_PATH}") + if(LIBSTDCXXFS_STATIC_RESULT EQUAL 0 AND EXISTS "${LIBSTDCXXFS_STATIC_PATH}") message(STATUS "libstdc++fs.a found at: ${LIBSTDCXXFS_STATIC_PATH}") target_link_libraries(gluten PRIVATE ${LIBSTDCXXFS_STATIC_PATH}) else() @@ -231,6 +238,7 @@ endif() find_arrow_lib(${ARROW_LIB_NAME}) find_arrow_lib(${PARQUET_LIB_NAME}) +find_arrow_lib(${ARROW_BUNDLED_DEPS}) if(ENABLE_HBM) include(BuildMemkind) @@ -243,67 +251,55 @@ if(ENABLE_QAT) include(BuildQATzip) include(BuildQATZstd) target_sources(gluten PRIVATE utils/qat/QatCodec.cc) - target_include_directories(gluten PUBLIC ${QATZIP_INCLUDE_DIR} ${QATZSTD_INCLUDE_DIR}) + target_include_directories(gluten PUBLIC ${QATZIP_INCLUDE_DIR} + ${QATZSTD_INCLUDE_DIR}) target_link_libraries(gluten PUBLIC qatzip::qatzip qatzstd::qatzstd) endif() if(ENABLE_IAA) include(BuildQpl) target_include_directories(gluten PUBLIC ${QPL_INCLUDE_DIR}) - target_sources(gluten PRIVATE utils/qpl/qpl_job_pool.cc utils/qpl/qpl_codec.cc) + target_sources(gluten PRIVATE utils/qpl/qpl_job_pool.cc + utils/qpl/qpl_codec.cc) target_link_libraries(gluten PUBLIC qpl::qpl) endif() if(BUILD_PROTOBUF) build_protobuf() message(STATUS "Building ProtoBuf from Source: ${BUILD_PROTOBUF}") - target_link_libraries(gluten - LINK_PRIVATE protobuf::libprotobuf) + target_link_libraries(gluten LINK_PRIVATE protobuf::libprotobuf) else() find_protobuf() message(STATUS "Use existing ProtoBuf libraries: ${PROTOBUF_LIBRARY}") - target_link_libraries(gluten - LINK_PUBLIC ${PROTOBUF_LIBRARY}) + target_link_libraries(gluten LINK_PUBLIC ${PROTOBUF_LIBRARY}) endif() -add_custom_command(OUTPUT ${SUBSTRAIT_PROTO_OUTPUT_FILES} - COMMAND ${PROTOC_BIN} - --proto_path - ${SUBSTRAIT_PROTO_SRC_DIR}/ - --cpp_out - ${PROTO_OUTPUT_DIR} - ${SUBSTRAIT_PROTO_FILES} - DEPENDS ${SUBSTRAIT_PROTO_DIR} - COMMENT "Running Substrait PROTO compiler" - VERBATIM) - -add_custom_command(OUTPUT ${GLUTEN_PROTO_OUTPUT_FILES} - COMMAND ${PROTOC_BIN} - --proto_path - ${GLUTEN_PROTO_SRC_DIR}/ - --cpp_out - ${PROTO_OUTPUT_DIR} - ${GLUTEN_PROTO_FILES} - DEPENDS ${GLUTEN_PROTO_DIR} - COMMENT "Running Gluten PROTO compiler" - VERBATIM) - -add_custom_target(jni_proto ALL DEPENDS 
${SUBSTRAIT_PROTO_OUTPUT_FILES} ${GLUTEN_PROTO_OUTPUT_FILES}) +add_custom_command( + OUTPUT ${SUBSTRAIT_PROTO_OUTPUT_FILES} + COMMAND ${PROTOC_BIN} --proto_path ${SUBSTRAIT_PROTO_SRC_DIR}/ --cpp_out + ${PROTO_OUTPUT_DIR} ${SUBSTRAIT_PROTO_FILES} + DEPENDS ${SUBSTRAIT_PROTO_DIR} + COMMENT "Running Substrait PROTO compiler" + VERBATIM) + +add_custom_command( + OUTPUT ${GLUTEN_PROTO_OUTPUT_FILES} + COMMAND ${PROTOC_BIN} --proto_path ${GLUTEN_PROTO_SRC_DIR}/ --cpp_out + ${PROTO_OUTPUT_DIR} ${GLUTEN_PROTO_FILES} + DEPENDS ${GLUTEN_PROTO_DIR} + COMMENT "Running Gluten PROTO compiler" + VERBATIM) + +add_custom_target(jni_proto ALL DEPENDS ${SUBSTRAIT_PROTO_OUTPUT_FILES} + ${GLUTEN_PROTO_OUTPUT_FILES}) add_dependencies(jni_proto protobuf::libprotobuf) -target_include_directories(gluten PUBLIC ${CMAKE_SYSTEM_INCLUDE_PATH} ${JNI_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR} ${PROTO_OUTPUT_DIR} ${PROTOBUF_INCLUDE}) -set_target_properties(gluten PROPERTIES - LIBRARY_OUTPUT_DIRECTORY ${root_directory}/releases) - -include(Findjemalloc_pic) -# Build Jemalloc -if(BUILD_JEMALLOC) - build_jemalloc(${STATIC_JEMALLOC}) - message(STATUS "Building Jemalloc: ${STATIC_JEMALLOC}") -else() # - find_jemalloc() - message(STATUS "Use existing Jemalloc libraries") -endif() +target_include_directories( + gluten + PUBLIC ${CMAKE_SYSTEM_INCLUDE_PATH} ${JNI_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR} ${PROTO_OUTPUT_DIR} ${PROTOBUF_INCLUDE}) +set_target_properties(gluten PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${root_directory}/releases) if(BUILD_TESTS) add_subdirectory(tests) @@ -313,23 +309,27 @@ if(BUILD_BENCHMARKS) add_subdirectory(benchmarks) endif() - if(DEFINED ENV{HADOOP_HOME}) set(LIBHDFS3_DESTINATION $ENV{HADOOP_HOME}/lib/native) else() set(LIBHDFS3_DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() -target_link_libraries(gluten - PUBLIC Arrow::arrow Arrow::parquet) +target_link_libraries(gluten PUBLIC Arrow::parquet Arrow::arrow + Arrow::arrow_bundled_dependencies) target_link_libraries(gluten PRIVATE google::glog) -install(TARGETS gluten - DESTINATION ${CMAKE_INSTALL_LIBDIR}) +install(TARGETS gluten DESTINATION ${CMAKE_INSTALL_LIBDIR}) install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/resources/libhdfs.so DESTINATION ${LIBHDFS3_DESTINATION}) -add_custom_command(TARGET gluten POST_BUILD COMMAND ld $ || true - COMMENT "Checking ld result of libgluten.so") -add_custom_command(TARGET gluten POST_BUILD COMMAND ldd $ || true - COMMENT "Checking ldd result of libgluten.so") +add_custom_command( + TARGET gluten + POST_BUILD + COMMAND ld $ || true + COMMENT "Checking ld result of libgluten.so") +add_custom_command( + TARGET gluten + POST_BUILD + COMMAND ldd $ || true + COMMENT "Checking ldd result of libgluten.so") diff --git a/cpp/core/benchmarks/CMakeLists.txt b/cpp/core/benchmarks/CMakeLists.txt index 6d39501477df8..4b4c7656639c0 100644 --- a/cpp/core/benchmarks/CMakeLists.txt +++ b/cpp/core/benchmarks/CMakeLists.txt @@ -31,7 +31,8 @@ macro(package_add_gbenchmark TESTNAME) add_executable(${TESTNAME} ${ARGN}) - target_link_libraries(${TESTNAME} benchmark::benchmark gluten google::glog ${CMAKE_THREAD_LIBS_INIT}) + target_link_libraries(${TESTNAME} benchmark::benchmark gluten google::glog + ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TESTNAME} PUBLIC ${source_root_directory}) set_target_properties(${TESTNAME} PROPERTIES FOLDER tests) endmacro() diff --git a/cpp/core/compute/Runtime.cc b/cpp/core/compute/Runtime.cc index 387da1a0a8970..c6bae1e7bc0bc 100644 --- a/cpp/core/compute/Runtime.cc +++ b/cpp/core/compute/Runtime.cc @@ 
-56,9 +56,12 @@ void Runtime::registerFactory(const std::string& kind, Runtime::Factory factory) runtimeFactories().registerFactory(kind, std::move(factory)); } -Runtime* Runtime::create(const std::string& kind, const std::unordered_map& sessionConf) { +Runtime* Runtime::create( + const std::string& kind, + std::unique_ptr listener, + const std::unordered_map& sessionConf) { auto& factory = runtimeFactories().getFactory(kind); - return factory(sessionConf); + return factory(std::move(listener), sessionConf); } void Runtime::release(Runtime* runtime) { diff --git a/cpp/core/compute/Runtime.h b/cpp/core/compute/Runtime.h index 7574b0219dcff..fb501dc9acca8 100644 --- a/cpp/core/compute/Runtime.h +++ b/cpp/core/compute/Runtime.h @@ -55,13 +55,18 @@ struct SparkTaskInfo { class Runtime : public std::enable_shared_from_this { public: - using Factory = std::function&)>; + using Factory = std::function< + Runtime*(std::unique_ptr listener, const std::unordered_map&)>; static void registerFactory(const std::string& kind, Factory factory); - static Runtime* create(const std::string& kind, const std::unordered_map& sessionConf = {}); + static Runtime* create( + const std::string& kind, + std::unique_ptr listener, + const std::unordered_map& sessionConf = {}); static void release(Runtime*); - Runtime() = default; - Runtime(const std::unordered_map& confMap) : confMap_(confMap) {} + Runtime(std::shared_ptr memoryManager, const std::unordered_map& confMap) + : memoryManager_(memoryManager), confMap_(confMap) {} + virtual ~Runtime() = default; virtual void parsePlan(const uint8_t* data, int32_t size, std::optional dumpFile) = 0; @@ -78,52 +83,40 @@ class Runtime : public std::enable_shared_from_this { } virtual std::shared_ptr createResultIterator( - MemoryManager* memoryManager, const std::string& spillDir, const std::vector>& inputs, const std::unordered_map& sessionConf) = 0; virtual std::shared_ptr createOrGetEmptySchemaBatch(int32_t numRows) = 0; - virtual std::shared_ptr - select(MemoryManager*, std::shared_ptr, std::vector) = 0; + virtual std::shared_ptr select(std::shared_ptr, std::vector) = 0; - virtual MemoryManager* createMemoryManager( - const std::string& name, - std::shared_ptr, - std::unique_ptr) = 0; + virtual MemoryManager* memoryManager() { + return memoryManager_.get(); + }; /// This function is used to create certain converter from the format used by /// the backend to Spark unsafe row. 
- virtual std::shared_ptr createColumnar2RowConverter(MemoryManager* memoryManager) = 0; + virtual std::shared_ptr createColumnar2RowConverter() = 0; - virtual std::shared_ptr createRow2ColumnarConverter( - MemoryManager* memoryManager, - struct ArrowSchema* cSchema) = 0; + virtual std::shared_ptr createRow2ColumnarConverter(struct ArrowSchema* cSchema) = 0; virtual std::shared_ptr createShuffleWriter( int numPartitions, std::unique_ptr partitionWriter, - ShuffleWriterOptions options, - MemoryManager* memoryManager) = 0; + ShuffleWriterOptions options) = 0; virtual Metrics* getMetrics(ColumnarBatchIterator* rawIter, int64_t exportNanos) = 0; virtual std::shared_ptr createDatasource( const std::string& filePath, - MemoryManager* memoryManager, std::shared_ptr schema) = 0; virtual std::shared_ptr createShuffleReader( std::shared_ptr schema, - ShuffleReaderOptions options, - arrow::MemoryPool* pool, - MemoryManager* memoryManager) = 0; + ShuffleReaderOptions options) = 0; - virtual std::unique_ptr createColumnarBatchSerializer( - MemoryManager* memoryManager, - arrow::MemoryPool* arrowPool, - struct ArrowSchema* cSchema) = 0; + virtual std::unique_ptr createColumnarBatchSerializer(struct ArrowSchema* cSchema) = 0; virtual void dumpConf(const std::string& path) = 0; @@ -135,17 +128,18 @@ class Runtime : public std::enable_shared_from_this { taskInfo_ = taskInfo; } - ObjectStore* objectStore() { - return objStore_.get(); + ObjectHandle saveObject(std::shared_ptr obj) { + return objStore_->save(obj); } protected: + std::shared_ptr memoryManager_; std::unique_ptr objStore_ = ObjectStore::create(); + std::unordered_map confMap_; // Session conf map + ::substrait::Plan substraitPlan_; std::vector<::substrait::ReadRel_LocalFiles> localFiles_; std::optional writeFilesTempPath_; SparkTaskInfo taskInfo_; - // Session conf map - std::unordered_map confMap_; }; } // namespace gluten diff --git a/cpp/core/config/GlutenConfig.h b/cpp/core/config/GlutenConfig.h index a039537b78ba5..ad7dacf113ecb 100644 --- a/cpp/core/config/GlutenConfig.h +++ b/cpp/core/config/GlutenConfig.h @@ -30,8 +30,6 @@ const std::string kGlutenSaveDir = "spark.gluten.saveDir"; const std::string kCaseSensitive = "spark.sql.caseSensitive"; -const std::string kLegacySize = "spark.sql.legacy.sizeOfNull"; - const std::string kSessionTimezone = "spark.sql.session.timeZone"; const std::string kIgnoreMissingFiles = "spark.sql.files.ignoreMissingFiles"; diff --git a/cpp/core/jni/JniCommon.cc b/cpp/core/jni/JniCommon.cc index 328a7b7722f9c..759a9d121f911 100644 --- a/cpp/core/jni/JniCommon.cc +++ b/cpp/core/jni/JniCommon.cc @@ -65,3 +65,61 @@ gluten::Runtime* gluten::getRuntime(JNIEnv* env, jobject runtimeAware) { GLUTEN_CHECK(ctx != nullptr, "FATAL: resource instance should not be null."); return ctx; } + +std::unique_ptr gluten::makeJniColumnarBatchIterator( + JNIEnv* env, + jobject jColumnarBatchItr, + gluten::Runtime* runtime, + std::shared_ptr writer) { + return std::make_unique(env, jColumnarBatchItr, runtime, writer); +} + +gluten::JniColumnarBatchIterator::JniColumnarBatchIterator( + JNIEnv* env, + jobject jColumnarBatchItr, + gluten::Runtime* runtime, + std::shared_ptr writer) + : runtime_(runtime), writer_(writer) { + // IMPORTANT: DO NOT USE LOCAL REF IN DIFFERENT THREAD + if (env->GetJavaVM(&vm_) != JNI_OK) { + std::string errorMessage = "Unable to get JavaVM instance"; + throw gluten::GlutenException(errorMessage); + } + serializedColumnarBatchIteratorClass_ = + createGlobalClassReferenceOrError(env, 
"Lorg/apache/gluten/vectorized/ColumnarBatchInIterator;"); + serializedColumnarBatchIteratorHasNext_ = + getMethodIdOrError(env, serializedColumnarBatchIteratorClass_, "hasNext", "()Z"); + serializedColumnarBatchIteratorNext_ = getMethodIdOrError(env, serializedColumnarBatchIteratorClass_, "next", "()J"); + jColumnarBatchItr_ = env->NewGlobalRef(jColumnarBatchItr); +} + +gluten::JniColumnarBatchIterator::~JniColumnarBatchIterator() { + JNIEnv* env; + attachCurrentThreadAsDaemonOrThrow(vm_, &env); + env->DeleteGlobalRef(jColumnarBatchItr_); + env->DeleteGlobalRef(serializedColumnarBatchIteratorClass_); + vm_->DetachCurrentThread(); +} + +std::shared_ptr gluten::JniColumnarBatchIterator::next() { + JNIEnv* env; + attachCurrentThreadAsDaemonOrThrow(vm_, &env); + if (!env->CallBooleanMethod(jColumnarBatchItr_, serializedColumnarBatchIteratorHasNext_)) { + checkException(env); + return nullptr; // stream ended + } + + checkException(env); + jlong handle = env->CallLongMethod(jColumnarBatchItr_, serializedColumnarBatchIteratorNext_); + checkException(env); + auto batch = ObjectStore::retrieve(handle); + if (writer_ != nullptr) { + // save snapshot of the batch to file + std::shared_ptr schema = batch->exportArrowSchema(); + std::shared_ptr array = batch->exportArrowArray(); + auto rb = gluten::arrowGetOrThrow(arrow::ImportRecordBatch(array.get(), schema.get())); + GLUTEN_THROW_NOT_OK(writer_->initWriter(*(rb->schema().get()))); + GLUTEN_THROW_NOT_OK(writer_->writeInBatches(rb)); + } + return batch; +} diff --git a/cpp/core/jni/JniCommon.h b/cpp/core/jni/JniCommon.h index 5858a70e9a777..d5c9f2b3b18b4 100644 --- a/cpp/core/jni/JniCommon.h +++ b/cpp/core/jni/JniCommon.h @@ -28,6 +28,8 @@ #include "memory/AllocationListener.h" #include "shuffle/rss/RssClient.h" #include "utils/Compression.h" +#include "utils/ObjectStore.h" +#include "utils/ResourceMap.h" #include "utils/exception.h" static jint jniVersion = JNI_VERSION_1_8; @@ -119,6 +121,12 @@ static inline void attachCurrentThreadAsDaemonOrThrow(JavaVM* vm, JNIEnv** out) } } +template +static T* jniCastOrThrow(jlong handle) { + auto instance = reinterpret_cast(handle); + GLUTEN_CHECK(instance != nullptr, "FATAL: resource instance should not be null."); + return instance; +} namespace gluten { class JniCommonState { @@ -251,6 +259,40 @@ DEFINE_SAFE_GET_PRIMITIVE_ARRAY_FUNCTIONS(kLong, jlongArray, Long) DEFINE_SAFE_GET_PRIMITIVE_ARRAY_FUNCTIONS(kFloat, jfloatArray, Float) DEFINE_SAFE_GET_PRIMITIVE_ARRAY_FUNCTIONS(kDouble, jdoubleArray, Double) +class JniColumnarBatchIterator : public ColumnarBatchIterator { + public: + explicit JniColumnarBatchIterator( + JNIEnv* env, + jobject jColumnarBatchItr, + Runtime* runtime, + std::shared_ptr writer); + + // singleton + JniColumnarBatchIterator(const JniColumnarBatchIterator&) = delete; + JniColumnarBatchIterator(JniColumnarBatchIterator&&) = delete; + JniColumnarBatchIterator& operator=(const JniColumnarBatchIterator&) = delete; + JniColumnarBatchIterator& operator=(JniColumnarBatchIterator&&) = delete; + + virtual ~JniColumnarBatchIterator(); + + std::shared_ptr next() override; + + private: + JavaVM* vm_; + jobject jColumnarBatchItr_; + Runtime* runtime_; + std::shared_ptr writer_; + + jclass serializedColumnarBatchIteratorClass_; + jmethodID serializedColumnarBatchIteratorHasNext_; + jmethodID serializedColumnarBatchIteratorNext_; +}; + +std::unique_ptr makeJniColumnarBatchIterator( + JNIEnv* env, + jobject jColumnarBatchItr, + Runtime* runtime, + std::shared_ptr writer); } // namespace gluten // TODO: 
Move the static functions to namespace gluten diff --git a/cpp/core/jni/JniError.cc b/cpp/core/jni/JniError.cc index af21ba2a1f3e1..b0f4e713a9482 100644 --- a/cpp/core/jni/JniError.cc +++ b/cpp/core/jni/JniError.cc @@ -41,7 +41,13 @@ jclass gluten::JniErrorState::illegalAccessExceptionClass() { return illegalAccessExceptionClass_; } +jclass gluten::JniErrorState::glutenExceptionClass() { + assertInitialized(); + return glutenExceptionClass_; +} + void gluten::JniErrorState::initialize(JNIEnv* env) { + glutenExceptionClass_ = createGlobalClassReference(env, "Lorg/apache/gluten/exception/GlutenException;"); ioExceptionClass_ = createGlobalClassReference(env, "Ljava/io/IOException;"); runtimeExceptionClass_ = createGlobalClassReference(env, "Ljava/lang/RuntimeException;"); unsupportedOperationExceptionClass_ = createGlobalClassReference(env, "Ljava/lang/UnsupportedOperationException;"); @@ -61,6 +67,7 @@ void gluten::JniErrorState::close() { } JNIEnv* env; attachCurrentThreadAsDaemonOrThrow(vm_, &env); + env->DeleteGlobalRef(glutenExceptionClass_); env->DeleteGlobalRef(ioExceptionClass_); env->DeleteGlobalRef(runtimeExceptionClass_); env->DeleteGlobalRef(unsupportedOperationExceptionClass_); diff --git a/cpp/core/jni/JniError.h b/cpp/core/jni/JniError.h index f902ebe0367ec..828c908612922 100644 --- a/cpp/core/jni/JniError.h +++ b/cpp/core/jni/JniError.h @@ -28,11 +28,11 @@ #endif #ifndef JNI_METHOD_END -#define JNI_METHOD_END(fallback_expr) \ - } \ - catch (std::exception & e) { \ - env->ThrowNew(gluten::getJniErrorState()->runtimeExceptionClass(), e.what()); \ - return fallback_expr; \ +#define JNI_METHOD_END(fallback_expr) \ + } \ + catch (std::exception & e) { \ + env->ThrowNew(gluten::getJniErrorState()->glutenExceptionClass(), e.what()); \ + return fallback_expr; \ } // macro ended #endif @@ -53,6 +53,8 @@ class JniErrorState { jclass illegalAccessExceptionClass(); + jclass glutenExceptionClass(); + private: void initialize(JNIEnv* env); @@ -61,6 +63,7 @@ class JniErrorState { jclass unsupportedOperationExceptionClass_ = nullptr; jclass illegalAccessExceptionClass_ = nullptr; jclass illegalArgumentExceptionClass_ = nullptr; + jclass glutenExceptionClass_ = nullptr; JavaVM* vm_; bool initialized_{false}; bool closed_{false}; diff --git a/cpp/core/jni/JniWrapper.cc b/cpp/core/jni/JniWrapper.cc index a04ba73a18aff..ea5c9d271c924 100644 --- a/cpp/core/jni/JniWrapper.cc +++ b/cpp/core/jni/JniWrapper.cc @@ -58,13 +58,8 @@ static jmethodID splitResultConstructor; static jclass columnarBatchSerializeResultClass; static jmethodID columnarBatchSerializeResultConstructor; -static jclass serializedColumnarBatchIteratorClass; static jclass metricsBuilderClass; static jmethodID metricsBuilderConstructor; - -static jmethodID serializedColumnarBatchIteratorHasNext; -static jmethodID serializedColumnarBatchIteratorNext; - static jclass nativeColumnarToRowInfoClass; static jmethodID nativeColumnarToRowInfoConstructor; @@ -72,8 +67,8 @@ static jclass shuffleReaderMetricsClass; static jmethodID shuffleReaderMetricsSetDecompressTime; static jmethodID shuffleReaderMetricsSetDeserializeTime; -static jclass block_stripes_class; -static jmethodID block_stripes_constructor; +static jclass blockStripesClass; +static jmethodID blockStripesConstructor; class JavaInputStreamAdaptor final : public arrow::io::InputStream { public: @@ -147,80 +142,6 @@ class JavaInputStreamAdaptor final : public arrow::io::InputStream { bool closed_ = false; }; -class JniColumnarBatchIterator : public ColumnarBatchIterator { - public: - 
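With the JniError change above, every JNI_METHOD_START / JNI_METHOD_END pair now rethrows native std::exceptions to Java as org.apache.gluten.exception.GlutenException instead of java.lang.RuntimeException. A self-contained sketch of what the macro pair expands to; the macro and function names below are invented for illustration, and the jclass would normally be the cached global reference from JniErrorState.

#include <jni.h>
#include <exception>

#define SKETCH_JNI_METHOD_START try {
#define SKETCH_JNI_METHOD_END(env, errorClass, fallback) \
  }                                                      \
  catch (const std::exception& e) {                      \
    (env)->ThrowNew((errorClass), e.what());             \
    return fallback;                                     \
  }

jlong exampleNativeCall(JNIEnv* env, jclass glutenExceptionClass) {
  SKETCH_JNI_METHOD_START
  // ... native work that may throw ...
  return 42;
  SKETCH_JNI_METHOD_END(env, glutenExceptionClass, -1L)
}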
explicit JniColumnarBatchIterator( - JNIEnv* env, - jobject jColumnarBatchItr, - Runtime* runtime, - std::shared_ptr writer) - : runtime_(runtime), writer_(writer) { - // IMPORTANT: DO NOT USE LOCAL REF IN DIFFERENT THREAD - if (env->GetJavaVM(&vm_) != JNI_OK) { - std::string errorMessage = "Unable to get JavaVM instance"; - throw gluten::GlutenException(errorMessage); - } - jColumnarBatchItr_ = env->NewGlobalRef(jColumnarBatchItr); - } - - // singleton - JniColumnarBatchIterator(const JniColumnarBatchIterator&) = delete; - JniColumnarBatchIterator(JniColumnarBatchIterator&&) = delete; - JniColumnarBatchIterator& operator=(const JniColumnarBatchIterator&) = delete; - JniColumnarBatchIterator& operator=(JniColumnarBatchIterator&&) = delete; - - virtual ~JniColumnarBatchIterator() { - JNIEnv* env; - attachCurrentThreadAsDaemonOrThrow(vm_, &env); - env->DeleteGlobalRef(jColumnarBatchItr_); - vm_->DetachCurrentThread(); - } - - std::shared_ptr next() override { - JNIEnv* env; - attachCurrentThreadAsDaemonOrThrow(vm_, &env); - if (!env->CallBooleanMethod(jColumnarBatchItr_, serializedColumnarBatchIteratorHasNext)) { - checkException(env); - return nullptr; // stream ended - } - - checkException(env); - jlong handle = env->CallLongMethod(jColumnarBatchItr_, serializedColumnarBatchIteratorNext); - checkException(env); - auto batch = runtime_->objectStore()->retrieve(handle); - if (writer_ != nullptr) { - // save snapshot of the batch to file - std::shared_ptr schema = batch->exportArrowSchema(); - std::shared_ptr array = batch->exportArrowArray(); - auto rb = gluten::arrowGetOrThrow(arrow::ImportRecordBatch(array.get(), schema.get())); - GLUTEN_THROW_NOT_OK(writer_->initWriter(*(rb->schema().get()))); - GLUTEN_THROW_NOT_OK(writer_->writeInBatches(rb)); - } - return batch; - } - - private: - JavaVM* vm_; - jobject jColumnarBatchItr_; - Runtime* runtime_; - std::shared_ptr writer_; -}; - -std::unique_ptr makeJniColumnarBatchIterator( - JNIEnv* env, - jobject jColumnarBatchItr, - Runtime* runtime, - std::shared_ptr writer) { - return std::make_unique(env, jColumnarBatchItr, runtime, writer); -} - -template -T* jniCastOrThrow(ResourceHandle handle) { - auto instance = reinterpret_cast(handle); - GLUTEN_CHECK(instance != nullptr, "FATAL: resource instance should not be null."); - return instance; -} - #ifdef __cplusplus extern "C" { #endif @@ -253,21 +174,13 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { metricsBuilderConstructor = getMethodIdOrError( env, metricsBuilderClass, "", "([J[J[J[J[J[J[J[J[J[JJ[J[J[J[J[J[J[J[J[J[J[J[J[J[J[J[J[J[J[J[J)V"); - serializedColumnarBatchIteratorClass = - createGlobalClassReferenceOrError(env, "Lorg/apache/gluten/vectorized/ColumnarBatchInIterator;"); - - serializedColumnarBatchIteratorHasNext = - getMethodIdOrError(env, serializedColumnarBatchIteratorClass, "hasNext", "()Z"); - - serializedColumnarBatchIteratorNext = getMethodIdOrError(env, serializedColumnarBatchIteratorClass, "next", "()J"); - nativeColumnarToRowInfoClass = createGlobalClassReferenceOrError(env, "Lorg/apache/gluten/vectorized/NativeColumnarToRowInfo;"); nativeColumnarToRowInfoConstructor = getMethodIdOrError(env, nativeColumnarToRowInfoClass, "", "([I[IJ)V"); javaReservationListenerClass = createGlobalClassReference( env, - "Lorg/apache/gluten/memory/nmm/" + "Lorg/apache/gluten/memory/listener/" "ReservationListener;"); reserveMemoryMethod = getMethodIdOrError(env, javaReservationListenerClass, "reserve", "(J)J"); @@ -280,9 +193,9 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { 
shuffleReaderMetricsSetDeserializeTime = getMethodIdOrError(env, shuffleReaderMetricsClass, "setDeserializeTime", "(J)V"); - block_stripes_class = + blockStripesClass = createGlobalClassReferenceOrError(env, "Lorg/apache/spark/sql/execution/datasources/BlockStripes;"); - block_stripes_constructor = env->GetMethodID(block_stripes_class, "", "(J[J[II[B)V"); + blockStripesConstructor = env->GetMethodID(blockStripesClass, "", "(J[J[II[B)V"); return jniVersion; } @@ -293,11 +206,10 @@ void JNI_OnUnload(JavaVM* vm, void* reserved) { env->DeleteGlobalRef(jniByteInputStreamClass); env->DeleteGlobalRef(splitResultClass); env->DeleteGlobalRef(columnarBatchSerializeResultClass); - env->DeleteGlobalRef(serializedColumnarBatchIteratorClass); env->DeleteGlobalRef(nativeColumnarToRowInfoClass); env->DeleteGlobalRef(byteArrayClass); env->DeleteGlobalRef(shuffleReaderMetricsClass); - env->DeleteGlobalRef(block_stripes_class); + env->DeleteGlobalRef(blockStripesClass); gluten::getJniErrorState()->close(); gluten::getJniCommonState()->close(); @@ -307,14 +219,63 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_createRunt JNIEnv* env, jclass, jstring jbackendType, + jobject jlistener, jbyteArray sessionConf) { JNI_METHOD_START + JavaVM* vm; + if (env->GetJavaVM(&vm) != JNI_OK) { + throw gluten::GlutenException("Unable to get JavaVM instance"); + } + auto backendType = jStringToCString(env, jbackendType); + std::unique_ptr listener = + std::make_unique(vm, jlistener, reserveMemoryMethod, unreserveMemoryMethod); + auto safeArray = gluten::getByteArrayElementsSafe(env, sessionConf); auto sparkConf = gluten::parseConfMap(env, safeArray.elems(), safeArray.length()); - auto runtime = gluten::Runtime::create(backendType, sparkConf); + auto runtime = gluten::Runtime::create(backendType, std::move(listener), sparkConf); return reinterpret_cast(runtime); - JNI_METHOD_END(kInvalidResourceHandle) + JNI_METHOD_END(kInvalidObjectHandle) +} + +JNIEXPORT jbyteArray JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_collectMemoryUsage( // NOLINT + JNIEnv* env, + jclass, + jlong ctxHandle) { + JNI_METHOD_START + auto runtime = jniCastOrThrow(ctxHandle); + + const MemoryUsageStats& stats = runtime->memoryManager()->collectMemoryUsageStats(); + auto size = stats.ByteSizeLong(); + jbyteArray out = env->NewByteArray(size); + uint8_t buffer[size]; + GLUTEN_CHECK( + stats.SerializeToArray(reinterpret_cast(buffer), size), + "Serialization failed when collecting memory usage stats"); + env->SetByteArrayRegion(out, 0, size, reinterpret_cast(buffer)); + return out; + JNI_METHOD_END(nullptr) +} + +JNIEXPORT jlong JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_shrinkMemory( // NOLINT + JNIEnv* env, + jclass, + jlong ctxHandle, + jlong size) { + JNI_METHOD_START + auto runtime = jniCastOrThrow(ctxHandle); + return runtime->memoryManager()->shrink(static_cast(size)); + JNI_METHOD_END(kInvalidObjectHandle) +} + +JNIEXPORT void JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_holdMemory( // NOLINT + JNIEnv* env, + jclass, + jlong ctxHandle) { + JNI_METHOD_START + auto runtime = jniCastOrThrow(ctxHandle); + runtime->memoryManager()->hold(); + JNI_METHOD_END() } JNIEXPORT void JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_releaseRuntime( // NOLINT @@ -366,7 +327,6 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_PlanEvaluatorJniWrapper_nativeCreateKernelWithIterator( // NOLINT JNIEnv* env, jobject wrapper, - jlong memoryManagerHandle, jbyteArray planArr, jobjectArray splitInfosArr, 
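collectMemoryUsage above serializes the MemoryUsageStats protobuf through a stack array whose length is only known at run time (uint8_t buffer[size] is a compiler extension rather than standard C++). Below is a hedged sketch of the same step using a heap buffer; the helper name is made up, and the patch may keep the existing approach deliberately.

#include <jni.h>
#include <cstdint>
#include <vector>
#include <google/protobuf/message_lite.h>

// Illustrative helper: serialize a protobuf message into a new jbyteArray via std::vector.
jbyteArray serializeToByteArray(JNIEnv* env, const google::protobuf::MessageLite& msg) {
  const size_t size = msg.ByteSizeLong();
  std::vector<uint8_t> buffer(size);
  if (!msg.SerializeToArray(buffer.data(), static_cast<int>(size))) {
    return nullptr; // caller would convert this into a thrown GlutenException
  }
  jbyteArray out = env->NewByteArray(static_cast<jsize>(size));
  env->SetByteArrayRegion(
      out, 0, static_cast<jsize>(size), reinterpret_cast<const jbyte*>(buffer.data()));
  return out;
}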
jobjectArray iterArr, @@ -396,8 +356,6 @@ Java_org_apache_gluten_vectorized_PlanEvaluatorJniWrapper_nativeCreateKernelWith ctx->dumpConf(saveDir + "/conf" + fileIdentifier + ".ini"); } - auto memoryManager = jniCastOrThrow(memoryManagerHandle); - auto spillDirStr = jStringToCString(env, spillDir); auto safePlanArray = gluten::getByteArrayElementsSafe(env, planArr); @@ -434,8 +392,8 @@ Java_org_apache_gluten_vectorized_PlanEvaluatorJniWrapper_nativeCreateKernelWith inputIters.push_back(std::move(resultIter)); } - return ctx->objectStore()->save(ctx->createResultIterator(memoryManager, spillDirStr, inputIters, conf)); - JNI_METHOD_END(kInvalidResourceHandle) + return ctx->saveObject(ctx->createResultIterator(spillDirStr, inputIters, conf)); + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jboolean JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchOutIterator_nativeHasNext( // NOLINT @@ -445,7 +403,7 @@ JNIEXPORT jboolean JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchOutIte JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto iter = ctx->objectStore()->retrieve(iterHandle); + auto iter = ObjectStore::retrieve(iterHandle); if (iter == nullptr) { std::string errorMessage = "When hasNext() is called on a closed iterator, an exception is thrown. To prevent this, consider using the protectInvocationFlow() method when creating the iterator in scala side. This will allow the hasNext() method to be called multiple times without issue."; @@ -462,17 +420,17 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchOutIterat JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto iter = ctx->objectStore()->retrieve(iterHandle); + auto iter = ObjectStore::retrieve(iterHandle); if (!iter->hasNext()) { - return kInvalidResourceHandle; + return kInvalidObjectHandle; } std::shared_ptr batch = iter->next(); - auto batchHandle = ctx->objectStore()->save(batch); + auto batchHandle = ctx->saveObject(batch); iter->setExportNanos(batch->getExportNanos()); return batchHandle; - JNI_METHOD_END(kInvalidResourceHandle) + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jobject JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchOutIterator_nativeFetchMetrics( // NOLINT @@ -482,7 +440,7 @@ JNIEXPORT jobject JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchOutIter JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto iter = ctx->objectStore()->retrieve(iterHandle); + auto iter = ObjectStore::retrieve(iterHandle); auto metrics = iter->getMetrics(); unsigned int numMetrics = 0; if (metrics) { @@ -543,9 +501,13 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchOutIterat JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto it = ctx->objectStore()->retrieve(iterHandle); + auto it = ObjectStore::retrieve(iterHandle); + if (it == nullptr) { + std::string errorMessage = "Invalid result iter handle " + std::to_string(iterHandle); + throw gluten::GlutenException(errorMessage); + } return it->spillFixedSize(size); - JNI_METHOD_END(kInvalidResourceHandle) + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT void JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchOutIterator_nativeClose( // NOLINT @@ -555,34 +517,32 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchOutIterato JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - ctx->objectStore()->release(iterHandle); + ObjectStore::release(iterHandle); JNI_METHOD_END() } JNIEXPORT jlong JNICALL 
Java_org_apache_gluten_vectorized_NativeColumnarToRowJniWrapper_nativeColumnarToRowInit( // NOLINT JNIEnv* env, - jobject wrapper, - jlong memoryManagerHandle) { + jobject wrapper) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto memoryManager = jniCastOrThrow(memoryManagerHandle); // Convert the native batch to Spark unsafe row. - return ctx->objectStore()->save(ctx->createColumnar2RowConverter(memoryManager)); - JNI_METHOD_END(kInvalidResourceHandle) + return ctx->saveObject(ctx->createColumnar2RowConverter()); + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jobject JNICALL Java_org_apache_gluten_vectorized_NativeColumnarToRowJniWrapper_nativeColumnarToRowConvert( // NOLINT JNIEnv* env, jobject wrapper, - jlong batchHandle, - jlong c2rHandle) { + jlong c2rHandle, + jlong batchHandle) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto columnarToRowConverter = ctx->objectStore()->retrieve(c2rHandle); - auto cb = ctx->objectStore()->retrieve(batchHandle); + auto columnarToRowConverter = ObjectStore::retrieve(c2rHandle); + auto cb = ObjectStore::retrieve(batchHandle); columnarToRowConverter->convert(cb); const auto& offsets = columnarToRowConverter->getOffsets(); @@ -611,22 +571,19 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_vectorized_NativeColumnarToRowJniW JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - ctx->objectStore()->release(c2rHandle); + ObjectStore::release(c2rHandle); JNI_METHOD_END() } JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_NativeRowToColumnarJniWrapper_init( // NOLINT JNIEnv* env, jobject wrapper, - jlong cSchema, - jlong memoryManagerHandle) { + jlong cSchema) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto memoryManager = jniCastOrThrow(memoryManagerHandle); - return ctx->objectStore()->save( - ctx->createRow2ColumnarConverter(memoryManager, reinterpret_cast(cSchema))); - JNI_METHOD_END(kInvalidResourceHandle) + return ctx->saveObject(ctx->createRow2ColumnarConverter(reinterpret_cast(cSchema))); + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jlong JNICALL @@ -646,10 +603,10 @@ Java_org_apache_gluten_vectorized_NativeRowToColumnarJniWrapper_nativeConvertRow auto safeArray = gluten::getLongArrayElementsSafe(env, rowLength); uint8_t* address = reinterpret_cast(memoryAddress); - auto converter = ctx->objectStore()->retrieve(r2cHandle); + auto converter = ObjectStore::retrieve(r2cHandle); auto cb = converter->convert(numRows, safeArray.elems(), address); - return ctx->objectStore()->save(cb); - JNI_METHOD_END(kInvalidResourceHandle) + return ctx->saveObject(cb); + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT void JNICALL Java_org_apache_gluten_vectorized_NativeRowToColumnarJniWrapper_close( // NOLINT @@ -659,7 +616,7 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_vectorized_NativeRowToColumnarJniW JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - ctx->objectStore()->release(r2cHandle); + ObjectStore::release(r2cHandle); JNI_METHOD_END() } @@ -669,7 +626,7 @@ JNIEXPORT jstring JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniW jlong batchHandle) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto batch = ctx->objectStore()->retrieve(batchHandle); + auto batch = ObjectStore::retrieve(batchHandle); return env->NewStringUTF(batch->getType().c_str()); JNI_METHOD_END(nullptr) } @@ -680,9 +637,9 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWra jlong batchHandle) { JNI_METHOD_START auto ctx = 
gluten::getRuntime(env, wrapper); - auto batch = ctx->objectStore()->retrieve(batchHandle); + auto batch = ObjectStore::retrieve(batchHandle); return batch->numBytes(); - JNI_METHOD_END(kInvalidResourceHandle) + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jlong JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWrapper_numColumns( // NOLINT @@ -691,9 +648,9 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWra jlong batchHandle) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto batch = ctx->objectStore()->retrieve(batchHandle); + auto batch = ObjectStore::retrieve(batchHandle); return batch->numColumns(); - JNI_METHOD_END(kInvalidResourceHandle) + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jlong JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWrapper_numRows( // NOLINT @@ -702,9 +659,9 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWra jlong batchHandle) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto batch = ctx->objectStore()->retrieve(batchHandle); + auto batch = ObjectStore::retrieve(batchHandle); return batch->numRows(); - JNI_METHOD_END(kInvalidResourceHandle) + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jlong JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWrapper_compose( // NOLINT @@ -720,12 +677,12 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWra std::vector> batches; for (int i = 0; i < handleCount; ++i) { int64_t handle = safeArray.elems()[i]; - auto batch = ctx->objectStore()->retrieve(handle); + auto batch = ObjectStore::retrieve(handle); batches.push_back(batch); } auto newBatch = CompositeColumnarBatch::create(std::move(batches)); - return ctx->objectStore()->save(newBatch); - JNI_METHOD_END(kInvalidResourceHandle) + return ctx->saveObject(newBatch); + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT void JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWrapper_exportToArrow( // NOLINT @@ -736,7 +693,7 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWrap jlong cArray) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto batch = ctx->objectStore()->retrieve(batchHandle); + auto batch = ObjectStore::retrieve(batchHandle); std::shared_ptr exportedSchema = batch->exportArrowSchema(); std::shared_ptr exportedArray = batch->exportArrowArray(); ArrowSchemaMove(exportedSchema.get(), reinterpret_cast(cSchema)); @@ -760,8 +717,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWra ArrowSchemaMove(arrowSchema, targetSchema.get()); std::shared_ptr batch = std::make_shared(std::move(targetSchema), std::move(targetArray)); - return ctx->objectStore()->save(batch); - JNI_METHOD_END(kInvalidResourceHandle) + return ctx->saveObject(batch); + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jlong JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWrapper_getForEmptySchema( // NOLINT @@ -770,19 +727,17 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWra jint numRows) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - return ctx->objectStore()->save(ctx->createOrGetEmptySchemaBatch(static_cast(numRows))); - JNI_METHOD_END(kInvalidResourceHandle) + return ctx->saveObject(ctx->createOrGetEmptySchemaBatch(static_cast(numRows))); + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jlong JNICALL 
Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWrapper_select( // NOLINT JNIEnv* env, jobject wrapper, - jlong memoryManagerHandle, jlong batchHandle, jintArray jcolumnIndices) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto memoryManager = jniCastOrThrow(memoryManagerHandle); auto safeArray = gluten::getIntArrayElementsSafe(env, jcolumnIndices); int size = env->GetArrayLength(jcolumnIndices); @@ -791,9 +746,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWra columnIndices.push_back(safeArray.elems()[i]); } - return ctx->objectStore()->save( - ctx->select(memoryManager, ctx->objectStore()->retrieve(batchHandle), std::move(columnIndices))); - JNI_METHOD_END(kInvalidResourceHandle) + return ctx->saveObject(ctx->select(ObjectStore::retrieve(batchHandle), std::move(columnIndices))); + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT void JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWrapper_close( // NOLINT @@ -802,7 +756,7 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_columnarbatch_ColumnarBatchJniWrap jlong batchHandle) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - ctx->objectStore()->release(batchHandle); + ObjectStore::release(batchHandle); JNI_METHOD_END() } @@ -823,7 +777,6 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe jstring dataFileJstr, jint numSubDirs, jstring localDirsJstr, - jlong memoryManagerHandle, jdouble reallocThreshold, jlong firstBatchHandle, jlong taskAttemptId, @@ -835,11 +788,11 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe jstring shuffleWriterTypeJstr) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto memoryManager = jniCastOrThrow(memoryManagerHandle); if (partitioningNameJstr == nullptr) { throw gluten::GlutenException(std::string("Short partitioning name can't be null")); } + // Build ShuffleWriterOptions. auto shuffleWriterOptions = ShuffleWriterOptions{ .bufferSize = bufferSize, .bufferReallocThreshold = reallocThreshold, @@ -847,7 +800,15 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe .taskAttemptId = (int64_t)taskAttemptId, .startPartitionId = startPartitionId, }; + auto shuffleWriterTypeC = env->GetStringUTFChars(shuffleWriterTypeJstr, JNI_FALSE); + auto shuffleWriterType = std::string(shuffleWriterTypeC); + env->ReleaseStringUTFChars(shuffleWriterTypeJstr, shuffleWriterTypeC); + + if (shuffleWriterType == "sort") { + shuffleWriterOptions.shuffleWriterType = kSortShuffle; + } + // Build PartitionWriterOptions. 
auto partitionWriterOptions = PartitionWriterOptions{ .mergeBufferSize = mergeBufferSize, .mergeThreshold = mergeThreshold, @@ -863,20 +824,13 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe partitionWriterOptions.codecBackend = getCodecBackend(env, codecBackendJstr); partitionWriterOptions.compressionMode = getCompressionMode(env, compressionModeJstr); } + std::unique_ptr partitionWriter; auto partitionWriterTypeC = env->GetStringUTFChars(partitionWriterTypeJstr, JNI_FALSE); auto partitionWriterType = std::string(partitionWriterTypeC); env->ReleaseStringUTFChars(partitionWriterTypeJstr, partitionWriterTypeC); - auto shuffleWriterTypeC = env->GetStringUTFChars(shuffleWriterTypeJstr, JNI_FALSE); - auto shuffleWriterType = std::string(shuffleWriterTypeC); - env->ReleaseStringUTFChars(shuffleWriterTypeJstr, shuffleWriterTypeC); - - if (shuffleWriterType == "sort") { - shuffleWriterOptions.shuffleWriterType = kSortShuffle; - } - if (partitionWriterType == "local") { if (dataFileJstr == NULL) { throw gluten::GlutenException(std::string("Shuffle DataFile can't be null")); @@ -895,7 +849,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe partitionWriter = std::make_unique( numPartitions, std::move(partitionWriterOptions), - memoryManager->getArrowMemoryPool(), + ctx->memoryManager()->getArrowMemoryPool(), dataFile, configuredDirs); } else if (partitionWriterType == "celeborn") { @@ -912,7 +866,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe partitionWriter = std::make_unique( numPartitions, std::move(partitionWriterOptions), - memoryManager->getArrowMemoryPool(), + ctx->memoryManager()->getArrowMemoryPool(), std::move(celebornClient)); } else if (partitionWriterType == "uniffle") { jclass unifflePartitionPusherClass = @@ -928,15 +882,15 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe partitionWriter = std::make_unique( numPartitions, std::move(partitionWriterOptions), - memoryManager->getArrowMemoryPool(), + ctx->memoryManager()->getArrowMemoryPool(), std::move(uniffleClient)); } else { throw gluten::GlutenException("Unrecognizable partition writer type: " + partitionWriterType); } - return ctx->objectStore()->save(ctx->createShuffleWriter( - numPartitions, std::move(partitionWriter), std::move(shuffleWriterOptions), memoryManager)); - JNI_METHOD_END(kInvalidResourceHandle) + return ctx->saveObject( + ctx->createShuffleWriter(numPartitions, std::move(partitionWriter), std::move(shuffleWriterOptions))); + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrapper_nativeEvict( // NOLINT @@ -948,7 +902,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto shuffleWriter = ctx->objectStore()->retrieve(shuffleWriterHandle); + auto shuffleWriter = ObjectStore::retrieve(shuffleWriterHandle); if (!shuffleWriter) { std::string errorMessage = "Invalid shuffle writer handle " + std::to_string(shuffleWriterHandle); throw gluten::GlutenException(errorMessage); @@ -957,7 +911,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe gluten::arrowAssertOkOrThrow( shuffleWriter->reclaimFixedSize(size, &evictedSize), "(shuffle) nativeEvict: evict failed"); return (jlong)evictedSize; - JNI_METHOD_END(kInvalidResourceHandle) + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jlong 
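nativeMake now parses the shuffle writer type before the partition writer is built, so a "sort" request switches ShuffleWriterOptions to kSortShuffle up front. The snippet below sketches the jstring handling and the mapping; the helper and enum names are stand-ins, and only kSortShuffle is visible in this patch.

#include <jni.h>
#include <string>

// Illustrative jstring-to-std::string conversion, as used for shuffleWriterTypeJstr
// and partitionWriterTypeJstr.
std::string toStdString(JNIEnv* env, jstring jstr) {
  const char* chars = env->GetStringUTFChars(jstr, nullptr);
  std::string result(chars);
  env->ReleaseStringUTFChars(jstr, chars);
  return result;
}

// Hypothetical enum mirroring the hash/sort switch.
enum ShuffleWriterKind { kHashKind, kSortKind };

ShuffleWriterKind parseShuffleWriterType(JNIEnv* env, jstring jstr) {
  return toStdString(env, jstr) == "sort" ? kSortKind : kHashKind;
}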
JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrapper_write( // NOLINT @@ -970,18 +924,18 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto shuffleWriter = ctx->objectStore()->retrieve(shuffleWriterHandle); + auto shuffleWriter = ObjectStore::retrieve(shuffleWriterHandle); if (!shuffleWriter) { std::string errorMessage = "Invalid shuffle writer handle " + std::to_string(shuffleWriterHandle); throw gluten::GlutenException(errorMessage); } // The column batch maybe VeloxColumnBatch or ArrowCStructColumnarBatch(FallbackRangeShuffleWriter) - auto batch = ctx->objectStore()->retrieve(batchHandle); + auto batch = ObjectStore::retrieve(batchHandle); auto numBytes = batch->numBytes(); gluten::arrowAssertOkOrThrow(shuffleWriter->write(batch, memLimit), "Native write: shuffle writer failed"); return numBytes; - JNI_METHOD_END(kInvalidResourceHandle) + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jobject JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrapper_stop( // NOLINT @@ -991,7 +945,7 @@ JNIEXPORT jobject JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrap JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto shuffleWriter = ctx->objectStore()->retrieve(shuffleWriterHandle); + auto shuffleWriter = ObjectStore::retrieve(shuffleWriterHandle); if (!shuffleWriter) { std::string errorMessage = "Invalid shuffle writer handle " + std::to_string(shuffleWriterHandle); throw gluten::GlutenException(errorMessage); @@ -1033,7 +987,7 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrapper JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - ctx->objectStore()->release(shuffleWriterHandle); + ObjectStore::release(shuffleWriterHandle); JNI_METHOD_END() } @@ -1053,16 +1007,13 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleReaderJniWrappe JNIEnv* env, jobject wrapper, jlong cSchema, - jlong memoryManagerHandle, jstring compressionType, jstring compressionBackend, jint batchSize, jstring shuffleWriterType) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto memoryManager = jniCastOrThrow(memoryManagerHandle); - auto pool = memoryManager->getArrowMemoryPool(); ShuffleReaderOptions options = ShuffleReaderOptions{}; options.compressionType = getCompressionType(env, compressionType); options.compressionTypeStr = getCompressionTypeStr(env, compressionType); @@ -1078,8 +1029,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleReaderJniWrappe std::shared_ptr schema = gluten::arrowGetOrThrow(arrow::ImportSchema(reinterpret_cast(cSchema))); - return ctx->objectStore()->save(ctx->createShuffleReader(schema, options, pool, memoryManager)); - JNI_METHOD_END(kInvalidResourceHandle) + return ctx->saveObject(ctx->createShuffleReader(schema, options)); + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleReaderJniWrapper_readStream( // NOLINT @@ -1089,11 +1040,11 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleReaderJniWrappe jobject jniIn) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto reader = ctx->objectStore()->retrieve(shuffleReaderHandle); + auto reader = ObjectStore::retrieve(shuffleReaderHandle); std::shared_ptr in = std::make_shared(env, reader->getPool(), jniIn); auto outItr = reader->readStream(in); - return ctx->objectStore()->save(outItr); - 
JNI_METHOD_END(kInvalidResourceHandle) + return ctx->saveObject(outItr); + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT void JNICALL Java_org_apache_gluten_vectorized_ShuffleReaderJniWrapper_populateMetrics( // NOLINT @@ -1104,7 +1055,7 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_vectorized_ShuffleReaderJniWrapper JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto reader = ctx->objectStore()->retrieve(shuffleReaderHandle); + auto reader = ObjectStore::retrieve(shuffleReaderHandle); env->CallVoidMethod(metrics, shuffleReaderMetricsSetDecompressTime, reader->getDecompressTime()); env->CallVoidMethod(metrics, shuffleReaderMetricsSetDeserializeTime, reader->getDeserializeTime()); @@ -1119,9 +1070,9 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_vectorized_ShuffleReaderJniWrapper JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto reader = ctx->objectStore()->retrieve(shuffleReaderHandle); + auto reader = ObjectStore::retrieve(shuffleReaderHandle); GLUTEN_THROW_NOT_OK(reader->close()); - ctx->objectStore()->release(shuffleReaderHandle); + ObjectStore::release(shuffleReaderHandle); JNI_METHOD_END() } @@ -1130,30 +1081,28 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_datasource_DatasourceJniWrapper_n jobject wrapper, jstring filePath, jlong cSchema, - jlong memoryManagerHandle, jbyteArray options) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto memoryManager = jniCastOrThrow(memoryManagerHandle); - ResourceHandle handle = kInvalidResourceHandle; + ObjectHandle handle = kInvalidObjectHandle; if (cSchema == -1) { // Only inspect the schema and not write - handle = ctx->objectStore()->save(ctx->createDatasource(jStringToCString(env, filePath), memoryManager, nullptr)); + handle = ctx->saveObject(ctx->createDatasource(jStringToCString(env, filePath), nullptr)); } else { auto safeArray = gluten::getByteArrayElementsSafe(env, options); auto datasourceOptions = gluten::parseConfMap(env, safeArray.elems(), safeArray.length()); auto& sparkConf = ctx->getConfMap(); datasourceOptions.insert(sparkConf.begin(), sparkConf.end()); auto schema = gluten::arrowGetOrThrow(arrow::ImportSchema(reinterpret_cast(cSchema))); - handle = ctx->objectStore()->save(ctx->createDatasource(jStringToCString(env, filePath), memoryManager, schema)); - auto datasource = ctx->objectStore()->retrieve(handle); + handle = ctx->saveObject(ctx->createDatasource(jStringToCString(env, filePath), schema)); + auto datasource = ObjectStore::retrieve(handle); datasource->init(datasourceOptions); } return handle; - JNI_METHOD_END(kInvalidResourceHandle) + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT void JNICALL Java_org_apache_gluten_datasource_DatasourceJniWrapper_inspectSchema( // NOLINT @@ -1164,7 +1113,7 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_datasource_DatasourceJniWrapper_in JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto datasource = ctx->objectStore()->retrieve(dsHandle); + auto datasource = ObjectStore::retrieve(dsHandle); datasource->inspectSchema(reinterpret_cast(cSchema)); JNI_METHOD_END() } @@ -1176,9 +1125,9 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_datasource_DatasourceJniWrapper_cl JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto datasource = ctx->objectStore()->retrieve(dsHandle); + auto datasource = ObjectStore::retrieve(dsHandle); datasource->close(); - ctx->objectStore()->release(dsHandle); + ObjectStore::release(dsHandle); JNI_METHOD_END() } @@ -1189,7 +1138,7 @@ JNIEXPORT void JNICALL 
Java_org_apache_gluten_datasource_DatasourceJniWrapper_wr jobject jIter) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto datasource = ctx->objectStore()->retrieve(dsHandle); + auto datasource = ObjectStore::retrieve(dsHandle); auto iter = makeJniColumnarBatchIterator(env, jIter, ctx, nullptr); while (true) { auto batch = iter->next(); @@ -1211,7 +1160,7 @@ Java_org_apache_gluten_datasource_DatasourceJniWrapper_splitBlockByPartitionAndB jlong memoryManagerId) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto batch = ctx->objectStore()->retrieve(batchHandle); + auto batch = ObjectStore::retrieve(batchHandle); auto safeArray = gluten::getIntArrayElementsSafe(env, partitionColIndice); int size = env->GetArrayLength(partitionColIndice); std::vector partitionColIndiceVec; @@ -1219,15 +1168,13 @@ Java_org_apache_gluten_datasource_DatasourceJniWrapper_splitBlockByPartitionAndB partitionColIndiceVec.push_back(safeArray.elems()[i]); } - MemoryManager* memoryManager = reinterpret_cast(memoryManagerId); - auto result = batch->getRowBytes(0); - auto rowBytes = result.first; - auto newBatchHandle = ctx->objectStore()->save(ctx->select(memoryManager, batch, partitionColIndiceVec)); + auto result = batch->toUnsafeRow(0); + auto rowBytes = result.data(); + auto newBatchHandle = ctx->saveObject(ctx->select(batch, partitionColIndiceVec)); - auto bytesSize = result.second; + auto bytesSize = result.size(); jbyteArray bytesArray = env->NewByteArray(bytesSize); env->SetByteArrayRegion(bytesArray, 0, bytesSize, reinterpret_cast(rowBytes)); - delete[] rowBytes; jlongArray batchArray = env->NewLongArray(1); long* cBatchArray = new long[1]; @@ -1235,150 +1182,18 @@ Java_org_apache_gluten_datasource_DatasourceJniWrapper_splitBlockByPartitionAndB env->SetLongArrayRegion(batchArray, 0, 1, cBatchArray); delete[] cBatchArray; - jobject block_stripes = env->NewObject( - block_stripes_class, - block_stripes_constructor, - batchHandle, - batchArray, - nullptr, - batch->numColumns(), - bytesArray); - return block_stripes; - JNI_METHOD_END(nullptr) -} - -JNIEXPORT jlong JNICALL Java_org_apache_gluten_memory_alloc_NativeMemoryAllocator_getAllocator( // NOLINT - JNIEnv* env, - jclass, - jstring jTypeName) { - JNI_METHOD_START - std::string typeName = jStringToCString(env, jTypeName); - std::shared_ptr* allocator = new std::shared_ptr; - if (typeName == "DEFAULT") { - *allocator = defaultMemoryAllocator(); - } else { - delete allocator; - allocator = nullptr; - throw GlutenException("Unexpected allocator type name: " + typeName); - } - return reinterpret_cast(allocator); - JNI_METHOD_END(kInvalidResourceHandle) -} - -JNIEXPORT void JNICALL Java_org_apache_gluten_memory_alloc_NativeMemoryAllocator_releaseAllocator( // NOLINT - JNIEnv* env, - jclass, - jlong allocatorId) { - JNI_METHOD_START - delete reinterpret_cast*>(allocatorId); - JNI_METHOD_END() -} - -JNIEXPORT jlong JNICALL Java_org_apache_gluten_memory_alloc_NativeMemoryAllocator_bytesAllocated( // NOLINT - JNIEnv* env, - jclass, - jlong allocatorId) { - JNI_METHOD_START - auto* alloc = reinterpret_cast*>(allocatorId); - if (alloc == nullptr) { - throw gluten::GlutenException("Memory allocator instance not found. 
It may not exist nor has been closed"); - } - return (*alloc)->getBytes(); - JNI_METHOD_END(kInvalidResourceHandle) -} - -JNIEXPORT jlong JNICALL Java_org_apache_gluten_memory_nmm_NativeMemoryManager_create( // NOLINT - JNIEnv* env, - jclass, - jstring jbackendType, - jstring jnmmName, - jlong allocatorId, - jobject jlistener) { - JNI_METHOD_START - JavaVM* vm; - if (env->GetJavaVM(&vm) != JNI_OK) { - throw gluten::GlutenException("Unable to get JavaVM instance"); - } - auto allocator = reinterpret_cast*>(allocatorId); - if (allocator == nullptr) { - throw gluten::GlutenException("Allocator does not exist or has been closed"); - } - - std::unique_ptr listener = - std::make_unique(vm, jlistener, reserveMemoryMethod, unreserveMemoryMethod); - - if (gluten::backtrace_allocation) { - listener = std::make_unique(std::move(listener)); - } - - auto name = jStringToCString(env, jnmmName); - auto backendType = jStringToCString(env, jbackendType); - // TODO: move memory manager into Runtime then we can use more general Runtime. - auto runtime = gluten::Runtime::create(backendType); - auto manager = runtime->createMemoryManager(name, *allocator, std::move(listener)); - gluten::Runtime::release(runtime); - return reinterpret_cast(manager); - JNI_METHOD_END(kInvalidResourceHandle) -} - -JNIEXPORT jbyteArray JNICALL Java_org_apache_gluten_memory_nmm_NativeMemoryManager_collectMemoryUsage( // NOLINT - JNIEnv* env, - jclass, - jlong memoryManagerHandle) { - JNI_METHOD_START - auto memoryManager = jniCastOrThrow(memoryManagerHandle); - - const MemoryUsageStats& stats = memoryManager->collectMemoryUsageStats(); - auto size = stats.ByteSizeLong(); - jbyteArray out = env->NewByteArray(size); - uint8_t buffer[size]; - GLUTEN_CHECK( - stats.SerializeToArray(reinterpret_cast(buffer), size), - "Serialization failed when collecting memory usage stats"); - env->SetByteArrayRegion(out, 0, size, reinterpret_cast(buffer)); - return out; + jobject blockStripes = env->NewObject( + blockStripesClass, blockStripesConstructor, batchHandle, batchArray, nullptr, batch->numColumns(), bytesArray); + return blockStripes; JNI_METHOD_END(nullptr) } -JNIEXPORT jlong JNICALL Java_org_apache_gluten_memory_nmm_NativeMemoryManager_shrink( // NOLINT - JNIEnv* env, - jclass, - jlong memoryManagerHandle, - jlong size) { - JNI_METHOD_START - auto memoryManager = jniCastOrThrow(memoryManagerHandle); - return memoryManager->shrink(static_cast(size)); - JNI_METHOD_END(kInvalidResourceHandle) -} - -JNIEXPORT void JNICALL Java_org_apache_gluten_memory_nmm_NativeMemoryManager_hold( // NOLINT - JNIEnv* env, - jclass, - jlong memoryManagerHandle) { - JNI_METHOD_START - auto memoryManager = jniCastOrThrow(memoryManagerHandle); - memoryManager->hold(); - JNI_METHOD_END() -} - -JNIEXPORT void JNICALL Java_org_apache_gluten_memory_nmm_NativeMemoryManager_release( // NOLINT - JNIEnv* env, - jclass, - jlong memoryManagerHandle) { - JNI_METHOD_START - auto memoryManager = jniCastOrThrow(memoryManagerHandle); - delete memoryManager; - JNI_METHOD_END() -} - JNIEXPORT jobject JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchSerializerJniWrapper_serialize( // NOLINT JNIEnv* env, jobject wrapper, - jlongArray handles, - jlong memoryManagerHandle) { + jlongArray handles) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto memoryManager = jniCastOrThrow(memoryManagerHandle); int32_t numBatches = env->GetArrayLength(handles); auto safeArray = gluten::getLongArrayElementsSafe(env, handles); @@ -1386,15 +1201,14 @@ JNIEXPORT jobject 
JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchSeriali std::vector> batches; int64_t numRows = 0L; for (int32_t i = 0; i < numBatches; i++) { - auto batch = ctx->objectStore()->retrieve(safeArray.elems()[i]); + auto batch = ObjectStore::retrieve(safeArray.elems()[i]); GLUTEN_DCHECK( batch != nullptr, "Cannot find the ColumnarBatch with handle " + std::to_string(safeArray.elems()[i])); numRows += batch->numRows(); batches.emplace_back(batch); } - auto arrowPool = memoryManager->getArrowMemoryPool(); - auto serializer = ctx->createColumnarBatchSerializer(memoryManager, arrowPool, nullptr); + auto serializer = ctx->createColumnarBatchSerializer(nullptr); auto buffer = serializer->serializeColumnarBatches(batches); auto bufferArr = env->NewByteArray(buffer->size()); env->SetByteArrayRegion(bufferArr, 0, buffer->size(), reinterpret_cast(buffer->data())); @@ -1409,16 +1223,11 @@ JNIEXPORT jobject JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchSeriali JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchSerializerJniWrapper_init( // NOLINT JNIEnv* env, jobject wrapper, - jlong cSchema, - jlong memoryManagerHandle) { + jlong cSchema) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto memoryManager = jniCastOrThrow(memoryManagerHandle); - - auto arrowPool = memoryManager->getArrowMemoryPool(); - return ctx->objectStore()->save( - ctx->createColumnarBatchSerializer(memoryManager, arrowPool, reinterpret_cast(cSchema))); - JNI_METHOD_END(kInvalidResourceHandle) + return ctx->saveObject(ctx->createColumnarBatchSerializer(reinterpret_cast(cSchema))); + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchSerializerJniWrapper_deserialize( // NOLINT @@ -1429,13 +1238,13 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchSerialize JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto serializer = ctx->objectStore()->retrieve(serializerHandle); + auto serializer = ObjectStore::retrieve(serializerHandle); GLUTEN_DCHECK(serializer != nullptr, "ColumnarBatchSerializer cannot be null"); int32_t size = env->GetArrayLength(data); auto safeArray = gluten::getByteArrayElementsSafe(env, data); auto batch = serializer->deserialize(safeArray.elems(), size); - return ctx->objectStore()->save(batch); - JNI_METHOD_END(kInvalidResourceHandle) + return ctx->saveObject(batch); + JNI_METHOD_END(kInvalidObjectHandle) } JNIEXPORT void JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchSerializerJniWrapper_close( // NOLINT @@ -1445,7 +1254,7 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_vectorized_ColumnarBatchSerializer JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - ctx->objectStore()->release(serializerHandle); + ObjectStore::release(serializerHandle); JNI_METHOD_END() } diff --git a/cpp/core/memory/ColumnarBatch.cc b/cpp/core/memory/ColumnarBatch.cc index bb80510ee3513..23567535d50a8 100644 --- a/cpp/core/memory/ColumnarBatch.cc +++ b/cpp/core/memory/ColumnarBatch.cc @@ -43,8 +43,8 @@ int64_t ColumnarBatch::getExportNanos() const { return exportNanos_; } -std::pair ColumnarBatch::getRowBytes(int32_t rowId) const { - throw gluten::GlutenException("Not implemented getRowBytes for ColumnarBatch"); +std::vector ColumnarBatch::toUnsafeRow(int32_t rowId) const { + throw gluten::GlutenException("Not implemented toUnsafeRow for ColumnarBatch"); } std::ostream& operator<<(std::ostream& os, const ColumnarBatch& columnarBatch) { @@ -86,8 +86,8 @@ std::shared_ptr 
ArrowColumnarBatch::exportArrowArray() { return cArray; } -std::pair ArrowColumnarBatch::getRowBytes(int32_t rowId) const { - throw gluten::GlutenException("Not implemented getRowBytes for ArrowColumnarBatch"); +std::vector ArrowColumnarBatch::toUnsafeRow(int32_t rowId) const { + throw gluten::GlutenException("#toUnsafeRow of ArrowColumnarBatch is not implemented"); } ArrowCStructColumnarBatch::ArrowCStructColumnarBatch( @@ -123,8 +123,8 @@ std::shared_ptr ArrowCStructColumnarBatch::exportArrowArray() { return cArray_; } -std::pair ArrowCStructColumnarBatch::getRowBytes(int32_t rowId) const { - throw gluten::GlutenException("Not implemented getRowBytes for ArrowCStructColumnarBatch"); +std::vector ArrowCStructColumnarBatch::toUnsafeRow(int32_t rowId) const { + throw gluten::GlutenException("#toUnsafeRow of ArrowCStructColumnarBatch is not implemented"); } std::shared_ptr CompositeColumnarBatch::create(std::vector> batches) { @@ -171,8 +171,8 @@ const std::vector>& CompositeColumnarBatch::getBa return batches_; } -std::pair CompositeColumnarBatch::getRowBytes(int32_t rowId) const { - throw gluten::GlutenException("Not implemented getRowBytes for CompositeColumnarBatch"); +std::vector CompositeColumnarBatch::toUnsafeRow(int32_t rowId) const { + throw gluten::GlutenException("#toUnsafeRow of CompositeColumnarBatch is not implemented"); } CompositeColumnarBatch::CompositeColumnarBatch( diff --git a/cpp/core/memory/ColumnarBatch.h b/cpp/core/memory/ColumnarBatch.h index 4a7b34889f603..fd8189aa6a20e 100644 --- a/cpp/core/memory/ColumnarBatch.h +++ b/cpp/core/memory/ColumnarBatch.h @@ -49,7 +49,8 @@ class ColumnarBatch { virtual int64_t getExportNanos() const; - virtual std::pair getRowBytes(int32_t rowId) const; + // Serializes one single row to byte array that can be accessed as Spark-compatible unsafe row. 
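The getRowBytes-to-toUnsafeRow change swaps a raw (pointer, length) pair for a std::vector<char> that owns the serialized row, which is why splitBlockByPartitionAndBucket above drops its delete[] rowBytes. A simplified caller-side sketch; the function name is invented.

#include <jni.h>
#include <vector>

// The returned vector owns the row bytes, so nothing needs to be freed manually
// once the data has been copied into the jbyteArray.
jbyteArray unsafeRowToByteArray(JNIEnv* env, const std::vector<char>& row) {
  jbyteArray bytesArray = env->NewByteArray(static_cast<jsize>(row.size()));
  env->SetByteArrayRegion(
      bytesArray, 0, static_cast<jsize>(row.size()), reinterpret_cast<const jbyte*>(row.data()));
  return bytesArray;
}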
+ virtual std::vector toUnsafeRow(int32_t rowId) const; friend std::ostream& operator<<(std::ostream& os, const ColumnarBatch& columnarBatch); @@ -75,7 +76,7 @@ class ArrowColumnarBatch final : public ColumnarBatch { std::shared_ptr exportArrowArray() override; - std::pair getRowBytes(int32_t rowId) const override; + std::vector toUnsafeRow(int32_t rowId) const override; private: std::shared_ptr batch_; @@ -95,7 +96,7 @@ class ArrowCStructColumnarBatch final : public ColumnarBatch { std::shared_ptr exportArrowArray() override; - std::pair getRowBytes(int32_t rowId) const override; + std::vector toUnsafeRow(int32_t rowId) const override; private: std::shared_ptr cSchema_ = std::make_shared(); @@ -120,7 +121,7 @@ class CompositeColumnarBatch final : public ColumnarBatch { const std::vector>& getBatches() const; - std::pair getRowBytes(int32_t rowId) const override; + std::vector toUnsafeRow(int32_t rowId) const override; private: explicit CompositeColumnarBatch( diff --git a/cpp/core/shuffle/LocalPartitionWriter.cc b/cpp/core/shuffle/LocalPartitionWriter.cc index 6c4a0af39a222..f56543bab5e05 100644 --- a/cpp/core/shuffle/LocalPartitionWriter.cc +++ b/cpp/core/shuffle/LocalPartitionWriter.cc @@ -25,7 +25,6 @@ #include "shuffle/Payload.h" #include "shuffle/Spill.h" #include "shuffle/Utils.h" -#include "utils/Timer.h" namespace gluten { @@ -547,7 +546,7 @@ arrow::Status LocalPartitionWriter::evict( arrow::Status LocalPartitionWriter::evict(uint32_t partitionId, int64_t rawSize, const char* data, int64_t length) { rawPartitionLengths_[partitionId] += rawSize; - if (partitionId <= lastEvictPid_) { + if (partitionId < lastEvictPid_) { RETURN_NOT_OK(finishSpill()); } lastEvictPid_ = partitionId; diff --git a/cpp/core/shuffle/Partitioner.cc b/cpp/core/shuffle/Partitioner.cc index 80b4598a1f170..fb1a5aab44ebd 100644 --- a/cpp/core/shuffle/Partitioner.cc +++ b/cpp/core/shuffle/Partitioner.cc @@ -18,6 +18,7 @@ #include "shuffle/Partitioner.h" #include "shuffle/FallbackRangePartitioner.h" #include "shuffle/HashPartitioner.h" +#include "shuffle/RandomPartitioner.h" #include "shuffle/RoundRobinPartitioner.h" #include "shuffle/SinglePartitioner.h" @@ -34,6 +35,8 @@ Partitioner::make(Partitioning partitioning, int32_t numPartitions, int32_t star return std::make_shared(); case Partitioning::kRange: return std::make_shared(numPartitions); + case Partitioning::kRandom: + return std::make_shared(numPartitions); default: return arrow::Status::Invalid("Unsupported partitioning type: " + std::to_string(partitioning)); } diff --git a/cpp/core/shuffle/Partitioning.cc b/cpp/core/shuffle/Partitioning.cc index dfe848d630466..84fe6ecd972f9 100644 --- a/cpp/core/shuffle/Partitioning.cc +++ b/cpp/core/shuffle/Partitioning.cc @@ -23,6 +23,7 @@ static const std::string kSinglePartitioningName = "single"; static const std::string kRoundRobinPartitioningName = "rr"; static const std::string kHashPartitioningName = "hash"; static const std::string kRangePartitioningName = "range"; +static const std::string kRandomPartitioningName = "random"; } // namespace namespace gluten { @@ -39,6 +40,9 @@ Partitioning toPartitioning(std::string name) { if (name == kRangePartitioningName) { return Partitioning::kRange; } + if (name == kRandomPartitioningName) { + return Partitioning::kRandom; + } throw GlutenException("Invalid partition name: " + name); } diff --git a/cpp/core/shuffle/Partitioning.h b/cpp/core/shuffle/Partitioning.h index 1d65e9d6b9933..a60d43561bee1 100644 --- a/cpp/core/shuffle/Partitioning.h +++ 
b/cpp/core/shuffle/Partitioning.h @@ -20,7 +20,7 @@ #include namespace gluten { -enum Partitioning { kSingle, kRoundRobin, kHash, kRange }; +enum Partitioning { kSingle, kRoundRobin, kHash, kRange, kRandom /*for test only*/ }; Partitioning toPartitioning(std::string name); diff --git a/cpp/core/shuffle/Payload.cc b/cpp/core/shuffle/Payload.cc index 626ed0cf0c18c..fb91c326b6793 100644 --- a/cpp/core/shuffle/Payload.cc +++ b/cpp/core/shuffle/Payload.cc @@ -327,6 +327,8 @@ arrow::Result>> BlockPayload::deseria case arrow::ListType::type_id: { hasComplexDataType = true; } break; + case arrow::NullType::type_id: + break; default: { buffers.emplace_back(); ARROW_ASSIGN_OR_RAISE(buffers.back(), readBuffer()); @@ -501,6 +503,7 @@ arrow::Status UncompressedDiskBlockPayload::serialize(arrow::io::OutputStream* o } arrow::Result> UncompressedDiskBlockPayload::readUncompressedBuffer() { + ScopedTimer timer(&writeTime_); readPos_++; int64_t bufferLength; RETURN_NOT_OK(inputStream_->Read(sizeof(int64_t), &bufferLength)); @@ -523,6 +526,7 @@ CompressedDiskBlockPayload::CompressedDiskBlockPayload( : Payload(Type::kCompressed, numRows, isValidityBuffer), inputStream_(inputStream), rawSize_(rawSize) {} arrow::Status CompressedDiskBlockPayload::serialize(arrow::io::OutputStream* outputStream) { + ScopedTimer timer(&writeTime_); ARROW_ASSIGN_OR_RAISE(auto block, inputStream_->Read(rawSize_)); RETURN_NOT_OK(outputStream->Write(block)); return arrow::Status::OK(); diff --git a/cpp/core/shuffle/RandomPartitioner.cc b/cpp/core/shuffle/RandomPartitioner.cc new file mode 100644 index 0000000000000..06d87be40f7f1 --- /dev/null +++ b/cpp/core/shuffle/RandomPartitioner.cc @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
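The two ScopedTimer lines added to Payload.cc above appear to account the time spent reading spilled blocks back and re-serializing them into the writeTime_ counter. Gluten ships its own ScopedTimer; the class below is only a generic RAII equivalent, with nanoseconds assumed as the unit.

#include <chrono>
#include <cstdint>

// Adds the elapsed wall-clock nanoseconds to the target counter when the scope ends.
class ScopedTimerSketch {
 public:
  explicit ScopedTimerSketch(int64_t* target)
      : target_(target), start_(std::chrono::steady_clock::now()) {}

  ~ScopedTimerSketch() {
    auto elapsed = std::chrono::steady_clock::now() - start_;
    *target_ += std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
  }

 private:
  int64_t* target_;
  std::chrono::steady_clock::time_point start_;
};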
+ */ + +#include "shuffle/RandomPartitioner.h" + +namespace gluten { + +arrow::Status gluten::RandomPartitioner::compute( + const int32_t* pidArr, + const int64_t numRows, + std::vector& row2Partition, + std::vector& partition2RowCount) { + std::fill(std::begin(partition2RowCount), std::end(partition2RowCount), 0); + row2Partition.resize(numRows); + + for (int32_t i = 0; i < numRows; ++i) { + row2Partition[i] = dist_(rng_); + } + + for (auto& pid : row2Partition) { + partition2RowCount[pid]++; + } + + return arrow::Status::OK(); +} + +arrow::Status gluten::RandomPartitioner::compute( + const int32_t* pidArr, + const int64_t numRows, + const int32_t vectorIndex, + std::unordered_map>& rowVectorIndexMap) { + auto index = static_cast(vectorIndex) << 32; + for (int32_t i = 0; i < numRows; ++i) { + int64_t combined = index | (i & 0xFFFFFFFFLL); + auto& vec = rowVectorIndexMap[dist_(rng_)]; + vec.push_back(combined); + } + + return arrow::Status::OK(); +} + +} // namespace gluten diff --git a/cpp/core/shuffle/RandomPartitioner.h b/cpp/core/shuffle/RandomPartitioner.h new file mode 100644 index 0000000000000..77d00716943cd --- /dev/null +++ b/cpp/core/shuffle/RandomPartitioner.h @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "shuffle/Partitioner.h" + +namespace gluten { +class RandomPartitioner final : public Partitioner { + public: + RandomPartitioner(int32_t numPartitions) : Partitioner(numPartitions, false) { + std::random_device dev; + rng_.seed(dev()); + dist_ = std::uniform_int_distribution(0, numPartitions - 1); + } + + arrow::Status compute( + const int32_t* pidArr, + const int64_t numRows, + std::vector& row2Partition, + std::vector& partition2RowCount) override; + + arrow::Status compute( + const int32_t* pidArr, + const int64_t numRows, + const int32_t vectorIndex, + std::unordered_map>& rowVectorIndexMap) override; + + private: + std::mt19937 rng_; + std::uniform_int_distribution dist_; +}; +} // namespace gluten diff --git a/cpp/core/tests/CMakeLists.txt b/cpp/core/tests/CMakeLists.txt index d8efd9d404441..a0f3406f5c195 100644 --- a/cpp/core/tests/CMakeLists.txt +++ b/cpp/core/tests/CMakeLists.txt @@ -18,3 +18,4 @@ if(ENABLE_HBM) endif() add_test_case(round_robin_partitioner_test SOURCES RoundRobinPartitionerTest.cc) +add_test_case(objectstore__test SOURCES ObjectStoreTest.cc) diff --git a/cpp/core/tests/ObjectStoreTest.cc b/cpp/core/tests/ObjectStoreTest.cc new file mode 100644 index 0000000000000..cb5ab3a877070 --- /dev/null +++ b/cpp/core/tests/ObjectStoreTest.cc @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
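The new RandomPartitioner (marked test-only) draws one uniformly distributed partition id per row and then tallies rows per partition. Below is a self-contained sketch of that computation; the element types of the two output vectors are assumptions, since they are not fixed by this excerpt.

#include <cstdint>
#include <random>
#include <vector>

void computeRandomPartitions(
    int64_t numRows,
    int32_t numPartitions,
    std::vector<uint32_t>& row2Partition,
    std::vector<uint32_t>& partition2RowCount) {
  static std::mt19937 rng{std::random_device{}()};
  std::uniform_int_distribution<int32_t> dist(0, numPartitions - 1);

  row2Partition.resize(numRows);
  partition2RowCount.assign(numPartitions, 0);
  for (int64_t i = 0; i < numRows; ++i) {
    row2Partition[i] = dist(rng); // one uniform draw per row
  }
  for (auto pid : row2Partition) {
    partition2RowCount[pid]++;
  }
}

// The second compute() overload in the patch instead groups rows by partition and
// packs the originating vector index and row index into one int64 per row, roughly:
//   int64_t combined = (static_cast<int64_t>(vectorIndex) << 32) | (i & 0xFFFFFFFFLL);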
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utils/ObjectStore.h" +#include + +using namespace gluten; + +TEST(ObjectStore, retreive) { + auto store = ObjectStore::create(); + auto obj = std::make_shared(1); + auto handle = store->save(obj); + auto retrieved = ObjectStore::retrieve(handle); + ASSERT_EQ(*retrieved, 1); +} + +TEST(ObjectStore, retreiveMultiple) { + auto store = ObjectStore::create(); + auto obj1 = std::make_shared(50); + auto obj2 = std::make_shared(100); + auto handle1 = store->save(obj1); + auto handle2 = store->save(obj2); + auto retrieved1 = ObjectStore::retrieve(handle1); + auto retrieved2 = ObjectStore::retrieve(handle2); + ASSERT_EQ(*retrieved1, *obj1); + ASSERT_EQ(*retrieved2, *obj2); +} + +TEST(ObjectStore, release) { + ObjectHandle handle = kInvalidObjectHandle; + auto store = ObjectStore::create(); + { + auto obj = std::make_shared(1); + handle = store->save(obj); + } + auto retrieved = ObjectStore::retrieve(handle); + ASSERT_EQ(*retrieved, 1); + ObjectStore::release(handle); + ASSERT_ANY_THROW(ObjectStore::retrieve(handle)); +} + +TEST(ObjectStore, releaseMultiple) { + ObjectHandle handle1 = kInvalidObjectHandle; + ObjectHandle handle2 = kInvalidObjectHandle; + auto store = ObjectStore::create(); + { + auto obj1 = std::make_shared(50); + auto obj2 = std::make_shared(100); + handle1 = store->save(obj1); + handle2 = store->save(obj2); + } + ASSERT_EQ(*ObjectStore::retrieve(handle1), 50); + ASSERT_EQ(*ObjectStore::retrieve(handle2), 100); + ObjectStore::release(handle2); + ASSERT_EQ(*ObjectStore::retrieve(handle1), 50); + ASSERT_ANY_THROW(ObjectStore::retrieve(handle2)); + ObjectStore::release(handle1); + ASSERT_ANY_THROW(ObjectStore::retrieve(handle1)); + ASSERT_ANY_THROW(ObjectStore::retrieve(handle2)); +} + +TEST(ObjectStore, releaseObjectsInMultipleStores) { + ObjectHandle handle1 = kInvalidObjectHandle; + ObjectHandle handle2 = kInvalidObjectHandle; + auto store1 = ObjectStore::create(); + auto store2 = ObjectStore::create(); + { + auto obj1 = std::make_shared(50); + auto obj2 = std::make_shared(100); + handle1 = store1->save(obj1); + handle2 = store2->save(obj2); + } + ASSERT_EQ(*ObjectStore::retrieve(handle1), 50); + ASSERT_EQ(*ObjectStore::retrieve(handle2), 100); + ObjectStore::release(handle2); + ASSERT_EQ(*ObjectStore::retrieve(handle1), 50); + ASSERT_ANY_THROW(ObjectStore::retrieve(handle2)); + ObjectStore::release(handle1); + ASSERT_ANY_THROW(ObjectStore::retrieve(handle1)); + ASSERT_ANY_THROW(ObjectStore::retrieve(handle2)); +} + +TEST(ObjectStore, releaseMultipleStores) { + ObjectHandle handle1 = kInvalidObjectHandle; + ObjectHandle handle2 = kInvalidObjectHandle; + auto store1 = ObjectStore::create(); + auto store2 = ObjectStore::create(); + { + auto obj1 = std::make_shared(50); + auto obj2 = std::make_shared(100); + handle1 = store1->save(obj1); + handle2 = store2->save(obj2); + } + 
ASSERT_EQ(*ObjectStore::retrieve(handle1), 50); + ASSERT_EQ(*ObjectStore::retrieve(handle2), 100); + store2.reset(); + ASSERT_EQ(*ObjectStore::retrieve(handle1), 50); + ASSERT_ANY_THROW(ObjectStore::retrieve(handle2)); + store1.reset(); + ASSERT_ANY_THROW(ObjectStore::retrieve(handle1)); + ASSERT_ANY_THROW(ObjectStore::retrieve(handle2)); +} diff --git a/cpp/core/utils/ObjectStore.cc b/cpp/core/utils/ObjectStore.cc index 5dc9c7843a73a..648163e4bcfb2 100644 --- a/cpp/core/utils/ObjectStore.cc +++ b/cpp/core/utils/ObjectStore.cc @@ -22,24 +22,22 @@ gluten::ObjectStore::~ObjectStore() { // destructing in reversed order (the last added object destructed first) const std::lock_guard lock(mtx_); - for (auto itr = aliveObjectHandles_.rbegin(); itr != aliveObjectHandles_.rend(); itr++) { + for (auto itr = aliveObjects_.rbegin(); itr != aliveObjects_.rend(); itr++) { ResourceHandle handle = *itr; - if (store_.lookup(handle) == nullptr) { - LOG(WARNING) << "Fatal: resource handle " + std::to_string(handle) + " not found in store."; - } store_.erase(handle); } + stores().erase(storeId_); } -gluten::ResourceHandle gluten::ObjectStore::save(std::shared_ptr obj) { +gluten::ObjectHandle gluten::ObjectStore::save(std::shared_ptr obj) { const std::lock_guard lock(mtx_); ResourceHandle handle = store_.insert(std::move(obj)); - aliveObjectHandles_.insert(handle); - return handle; + aliveObjects_.insert(handle); + return toObjHandle(handle); } -void gluten::ObjectStore::release(gluten::ResourceHandle handle) { +void gluten::ObjectStore::release0(gluten::ResourceHandle handle) { const std::lock_guard lock(mtx_); store_.erase(handle); - aliveObjectHandles_.erase(handle); + aliveObjects_.erase(handle); } diff --git a/cpp/core/utils/ObjectStore.h b/cpp/core/utils/ObjectStore.h index 6bfecf599b0d1..8e5dd250a03b0 100644 --- a/cpp/core/utils/ObjectStore.h +++ b/cpp/core/utils/ObjectStore.h @@ -19,25 +19,73 @@ #include #include "utils/ResourceMap.h" -#include "utils/exception.h" namespace gluten { +// ObjectHandle is a signed int64 consisting of: +// 1. 1 - 32 bits is a signed int32 as the object store's ID; +// 2. 1st bit is always zero to be compatible with jlong; +// 3. 33 - 64 bits is an unsigned int32 as the object's ID; +// +// When the object is tended to be retrieved with its ObjectHandle, +// the program first finds its resident object store, then looks up +// for the object in the store. +using StoreHandle = int32_t; +using ObjectHandle = int64_t; +constexpr static ObjectHandle kInvalidObjectHandle = -1; + // A store for caching shared-ptrs and enlarging lifecycles of the ptrs to match lifecycle of the store itself by // default, and also serving release calls to release a ptr in advance. This is typically used in JNI scenario to bind // a shared-ptr's lifecycle to a Java-side object or some kind of resource manager. 
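// For illustration, a sketch of the handle round-trip implied by the layout above (the store
// ID 4 and object ID 7 are hypothetical values chosen only to show the bit arithmetic):
//
//   ObjectHandle handle = (static_cast<ObjectHandle>(4) << 32) | 7;   // 0x0000000400000007
//   StoreHandle storeId = static_cast<StoreHandle>(handle >> 32);     // 4 -> locate the store
//   uint32_t objectId = static_cast<uint32_t>(handle & 0xFFFFFFFF);   // 7 -> look up the object
//
// The sign bit stays zero as long as the store ID is a positive int32, which is what keeps the
// handle representable as a Java jlong on the JNI side.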
class ObjectStore { public: static std::unique_ptr create() { - return std::unique_ptr(new ObjectStore()); + static std::mutex mtx; + std::lock_guard lock(mtx); + StoreHandle nextId = stores().nextId(); + auto store = std::unique_ptr(new ObjectStore(nextId)); + StoreHandle storeId = safeCast(stores().insert(store.get())); + GLUTEN_CHECK(storeId == nextId, "Store ID mismatched, this should not happen"); + return store; + } + + static void release(ObjectHandle handle) { + ResourceHandle storeId = safeCast(handle >> (sizeof(ResourceHandle) * 8)); + ResourceHandle resourceId = safeCast(handle & std::numeric_limits::max()); + auto store = stores().lookup(storeId); + store->release0(resourceId); + } + + template + static std::shared_ptr retrieve(ObjectHandle handle) { + ResourceHandle storeId = safeCast(handle >> (sizeof(ResourceHandle) * 8)); + ResourceHandle resourceId = safeCast(handle & std::numeric_limits::max()); + auto store = stores().lookup(storeId); + return store->retrieve0(resourceId); } virtual ~ObjectStore(); - ResourceHandle save(std::shared_ptr obj); + StoreHandle id() { + return storeId_; + } + + ObjectHandle save(std::shared_ptr obj); + + private: + static ResourceMap& stores() { + static ResourceMap stores; + return stores; + } + + ObjectHandle toObjHandle(ResourceHandle rh) { + ObjectHandle prefix = static_cast(storeId_) << (sizeof(ResourceHandle) * 8); + ObjectHandle objHandle = prefix + rh; + return objHandle; + } template - std::shared_ptr retrieve(ResourceHandle handle) { + std::shared_ptr retrieve0(ResourceHandle handle) { const std::lock_guard lock(mtx_); std::shared_ptr object = store_.lookup(handle); // Programming carefully. This will lead to ub if wrong typename T was passed in. @@ -45,12 +93,12 @@ class ObjectStore { return casted; } - void release(ResourceHandle handle); + void release0(ResourceHandle handle); - private: - ObjectStore(){}; + ObjectStore(StoreHandle storeId) : storeId_(storeId){}; + StoreHandle storeId_; ResourceMap> store_; - std::set aliveObjectHandles_; + std::set aliveObjects_; std::mutex mtx_; }; } // namespace gluten diff --git a/cpp/core/utils/ResourceMap.h b/cpp/core/utils/ResourceMap.h index c47fbd9ecb399..580ad4f6f94b9 100644 --- a/cpp/core/utils/ResourceMap.h +++ b/cpp/core/utils/ResourceMap.h @@ -17,13 +17,25 @@ #pragma once +#include +#include #include #include +#include "utils/exception.h" namespace gluten { +using ResourceHandle = uint32_t; +static_assert(std::numeric_limits::min() == 0); -using ResourceHandle = int64_t; -constexpr static ResourceHandle kInvalidResourceHandle = -1; +template +T safeCast(F f) { + GLUTEN_CHECK(sizeof(T) <= sizeof(F), "Vain safe casting"); + F min = 0; + F max = static_cast(std::numeric_limits::max()); + GLUTEN_CHECK(f >= min, "Safe casting a negative number"); + GLUTEN_CHECK(f <= max, "Number overflow"); + return static_cast(f); +} /** * An utility class that map resource handle to its shared pointers. 
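A minimal sketch of the contract safeCast enforces when narrowing a wider signed value into
ResourceHandle; the explicit <ResourceHandle> target type and the three sample values are
illustrative only and show which GLUTEN_CHECK message fires:

    int64_t ok = 42;
    int64_t negative = -1;
    int64_t tooBig = int64_t{1} << 40;
    auto a = safeCast<ResourceHandle>(ok);        // returns 42u
    auto b = safeCast<ResourceHandle>(negative);  // fails: "Safe casting a negative number"
    auto c = safeCast<ResourceHandle>(tooBig);    // fails: "Number overflow"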
@@ -36,40 +48,48 @@ class ResourceMap { ResourceMap() : resourceId_(kInitResourceId) {} ResourceHandle insert(TResource holder) { - ResourceHandle result = resourceId_++; + ResourceHandle result = safeCast(resourceId_++); + const std::lock_guard lock(mtx_); map_.insert(std::pair(result, holder)); return result; } void erase(ResourceHandle moduleId) { - map_.erase(moduleId); + const std::lock_guard lock(mtx_); + GLUTEN_CHECK(map_.erase(moduleId) == 1, "Module not found in resource map: " + std::to_string(moduleId)); } TResource lookup(ResourceHandle moduleId) { + const std::lock_guard lock(mtx_); auto it = map_.find(moduleId); - if (it != map_.end()) { - return it->second; - } - return nullptr; + GLUTEN_CHECK(it != map_.end(), "Module not found in resource map: " + std::to_string(moduleId)); + return it->second; } void clear() { + const std::lock_guard lock(mtx_); map_.clear(); } size_t size() { + const std::lock_guard lock(mtx_); return map_.size(); } + size_t nextId() { + return resourceId_; + } + private: // Initialize the resource id starting value to a number greater than zero // to allow for easier debugging of uninitialized java variables. - static constexpr int kInitResourceId = 4; + static constexpr size_t kInitResourceId = 4; - ResourceHandle resourceId_; + std::atomic resourceId_{0}; // map from resource ids returned to Java and resource pointers std::unordered_map map_; + std::mutex mtx_; }; } // namespace gluten diff --git a/cpp/velox/CMakeLists.txt b/cpp/velox/CMakeLists.txt index 9bedfe45ba0e0..b734669b8e463 100644 --- a/cpp/velox/CMakeLists.txt +++ b/cpp/velox/CMakeLists.txt @@ -23,39 +23,54 @@ include(GNUInstallDirs) include(CheckCXXCompilerFlag) include(FindPackageHandleStandardArgs) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-attributes") -if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") +set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-attributes") +if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-class-memaccess") endif() -set(SYSTEM_LIB_PATH "/usr/lib" CACHE PATH "System Lib dir") -set(SYSTEM_LIB64_PATH "/usr/lib64" CACHE PATH "System Lib64 dir") -set(SYSTEM_LOCAL_LIB_PATH "/usr/local/lib" CACHE PATH "System Local Lib dir") -set(SYSTEM_LOCAL_LIB64_PATH "/usr/local/lib64" CACHE PATH "System Local Lib64 dir") -if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") - set(SYSTEM_LIB_MULTIARCH_PATH "/usr/lib/x86_64-linux-gnu" CACHE PATH "System Lib MultiArch dir") +set(SYSTEM_LIB_PATH + "/usr/lib" + CACHE PATH "System Lib dir") +set(SYSTEM_LIB64_PATH + "/usr/lib64" + CACHE PATH "System Lib64 dir") +set(SYSTEM_LOCAL_LIB_PATH + "/usr/local/lib" + CACHE PATH "System Local Lib dir") +set(SYSTEM_LOCAL_LIB64_PATH + "/usr/local/lib64" + CACHE PATH "System Local Lib64 dir") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") + set(SYSTEM_LIB_MULTIARCH_PATH + "/usr/lib/x86_64-linux-gnu" + CACHE PATH "System Lib MultiArch dir") elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64) - set(SYSTEM_LIB_MULTIARCH_PATH "/usr/lib/aarch64-linux-gnu" CACHE PATH "System Lib MultiArch dir") + set(SYSTEM_LIB_MULTIARCH_PATH + "/usr/lib/aarch64-linux-gnu" + CACHE PATH "System Lib MultiArch dir") elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL arm64) - set(SYSTEM_LIB_MULTIARCH_PATH "/usr/lib" CACHE PATH "System Lib MultiArch dir") + set(SYSTEM_LIB_MULTIARCH_PATH + "/usr/lib" + CACHE PATH "System Lib MultiArch dir") else() message(FATAL_ERROR "Unsupported processor type: 
${CMAKE_SYSTEM_PROCESSOR}") endif() -if (NOT DEFINED VELOX_HOME) +if(NOT DEFINED VELOX_HOME) set(VELOX_HOME ${GLUTEN_HOME}/ep/build-velox/build/velox_ep) message(STATUS "Set VELOX_HOME to ${VELOX_HOME}") endif() # Keep same compile option with Velox. execute_process( - COMMAND + COMMAND bash -c "( source ${VELOX_HOME}/scripts/setup-helper-functions.sh && echo -n $(get_cxx_flags $ENV{CPU_TARGET}))" - OUTPUT_VARIABLE SCRIPT_CXX_FLAGS - RESULT_VARIABLE COMMAND_STATUS) + OUTPUT_VARIABLE SCRIPT_CXX_FLAGS + RESULT_VARIABLE COMMAND_STATUS) if(COMMAND_STATUS EQUAL "1") - message(FATAL_ERROR "Unable to determine compiler flags!") + message(FATAL_ERROR "Unable to determine compiler flags!") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SCRIPT_CXX_FLAGS}") @@ -63,10 +78,14 @@ message("Velox module final CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}") # User can specify VELOX_BUILD_PATH, if Velox are built elsewhere. if(NOT DEFINED VELOX_BUILD_PATH) - if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") - set(VELOX_BUILD_PATH "${VELOX_HOME}/_build/debug" CACHE PATH "Velox build directory.") + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + set(VELOX_BUILD_PATH + "${VELOX_HOME}/_build/debug" + CACHE PATH "Velox build directory.") else() - set(VELOX_BUILD_PATH "${VELOX_HOME}/_build/release" CACHE PATH "Velox build directory.") + set(VELOX_BUILD_PATH + "${VELOX_HOME}/_build/release" + CACHE PATH "Velox build directory.") endif() endif() @@ -78,25 +97,28 @@ function(ADD_VELOX_DEPENDENCY VELOX_DEP_LIB_NAME VELOX_DEP_LIB_PATH) endif() set(VELOX_DEP_LIB facebook::velox::${VELOX_DEP_LIB_NAME}) add_library(${VELOX_DEP_LIB} STATIC IMPORTED) - set_target_properties(${VELOX_DEP_LIB} PROPERTIES - IMPORTED_LOCATION ${VELOX_DEP_LIB_PATH}) + set_target_properties(${VELOX_DEP_LIB} PROPERTIES IMPORTED_LOCATION + ${VELOX_DEP_LIB_PATH}) target_link_libraries(velox PUBLIC ${VELOX_DEP_LIB}) endfunction() macro(ADD_VELOX_OBJECTS) add_library(velox_objects OBJECT IMPORTED GLOBAL) - set_property(TARGET velox_objects PROPERTY IMPORTED_OBJECTS - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/FileHandle.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveConfig.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveConnector.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveDataSink.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveDataSource.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HivePartitionUtil.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/PartitionIdGenerator.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/SplitReader.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/TableHandle.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveConnectorUtil.cpp.o" - ) + set_property( + TARGET velox_objects + PROPERTY + IMPORTED_OBJECTS + "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/FileHandle.cpp.o" + "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveConfig.cpp.o" + "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveConnector.cpp.o" + "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveDataSink.cpp.o" + 
"${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveDataSource.cpp.o" + "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HivePartitionUtil.cpp.o" + "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/PartitionIdGenerator.cpp.o" + "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/SplitReader.cpp.o" + "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/TableHandle.cpp.o" + "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveConnectorUtil.cpp.o" + ) target_link_libraries(velox PUBLIC velox_objects) endmacro() @@ -112,125 +134,305 @@ endmacro() macro(ADD_VELOX_DEPENDENCIES) add_velox_objects() - add_velox_dependency(expression::sigparser "${VELOX_COMPONENTS_PATH}/expression/signature_parser/libvelox_signature_parser.a") - add_velox_dependency(functions::sparksql::lib "${VELOX_COMPONENTS_PATH}/functions/sparksql/libvelox_functions_spark.a") - add_velox_dependency(functions::sparksql::agg "${VELOX_COMPONENTS_PATH}/functions/sparksql/aggregates/libvelox_functions_spark_aggregates.a") - add_velox_dependency(functions::window::sparksql "${VELOX_COMPONENTS_PATH}/functions/sparksql/window/libvelox_functions_spark_window.a") - add_velox_dependency(functions::prestosql::agg "${VELOX_COMPONENTS_PATH}/functions/prestosql/aggregates/libvelox_aggregates.a") - add_velox_dependency(functions::lib::agg "${VELOX_COMPONENTS_PATH}/functions/lib/aggregates/libvelox_functions_aggregates.a") - add_velox_dependency(functions::prestosql::window "${VELOX_COMPONENTS_PATH}/functions/prestosql/window/libvelox_window.a") - add_velox_dependency(functions::lib::window "${VELOX_COMPONENTS_PATH}/functions/lib/window/libvelox_functions_window.a") - add_velox_dependency(velox::buffer "${VELOX_COMPONENTS_PATH}/buffer/libvelox_buffer.a") - - add_velox_dependency(functions::isnull "${VELOX_COMPONENTS_PATH}/functions/lib/libvelox_is_null_functions.a") - add_velox_dependency(functions::prestosql "${VELOX_COMPONENTS_PATH}/functions/prestosql/registration/libvelox_functions_prestosql.a") - add_velox_dependency(functions::prestosql::impl "${VELOX_COMPONENTS_PATH}/functions/prestosql/libvelox_functions_prestosql_impl.a") - add_velox_dependency(functions::json "${VELOX_COMPONENTS_PATH}/functions/prestosql/json/libvelox_functions_json.a") - add_velox_dependency(functions::hyperloglog "${VELOX_COMPONENTS_PATH}/common/hyperloglog/libvelox_common_hyperloglog.a") - add_velox_dependency(functions::lib "${VELOX_COMPONENTS_PATH}/functions/lib/libvelox_functions_lib.a") + add_velox_dependency( + expression::sigparser + "${VELOX_COMPONENTS_PATH}/expression/signature_parser/libvelox_signature_parser.a" + ) + add_velox_dependency( + functions::sparksql::lib + "${VELOX_COMPONENTS_PATH}/functions/sparksql/libvelox_functions_spark.a") + add_velox_dependency( + functions::sparksql::agg + "${VELOX_COMPONENTS_PATH}/functions/sparksql/aggregates/libvelox_functions_spark_aggregates.a" + ) + add_velox_dependency( + functions::window::sparksql + "${VELOX_COMPONENTS_PATH}/functions/sparksql/window/libvelox_functions_spark_window.a" + ) + add_velox_dependency( + functions::prestosql::agg + "${VELOX_COMPONENTS_PATH}/functions/prestosql/aggregates/libvelox_aggregates.a" + ) + add_velox_dependency( + functions::lib::agg + "${VELOX_COMPONENTS_PATH}/functions/lib/aggregates/libvelox_functions_aggregates.a" + ) + add_velox_dependency( + functions::prestosql::window + 
"${VELOX_COMPONENTS_PATH}/functions/prestosql/window/libvelox_window.a") + add_velox_dependency( + functions::lib::window + "${VELOX_COMPONENTS_PATH}/functions/lib/window/libvelox_functions_window.a") + add_velox_dependency(velox::buffer + "${VELOX_COMPONENTS_PATH}/buffer/libvelox_buffer.a") + + add_velox_dependency( + functions::isnull + "${VELOX_COMPONENTS_PATH}/functions/lib/libvelox_is_null_functions.a") + add_velox_dependency( + functions::prestosql + "${VELOX_COMPONENTS_PATH}/functions/prestosql/registration/libvelox_functions_prestosql.a" + ) + add_velox_dependency( + functions::prestosql::impl + "${VELOX_COMPONENTS_PATH}/functions/prestosql/libvelox_functions_prestosql_impl.a" + ) + add_velox_dependency( + functions::json + "${VELOX_COMPONENTS_PATH}/functions/prestosql/json/libvelox_functions_json.a" + ) + add_velox_dependency( + functions::hyperloglog + "${VELOX_COMPONENTS_PATH}/common/hyperloglog/libvelox_common_hyperloglog.a") + add_velox_dependency( + functions::lib + "${VELOX_COMPONENTS_PATH}/functions/lib/libvelox_functions_lib.a") + add_velox_dependency( + functions::lib::date_time_formatter + "${VELOX_COMPONENTS_PATH}/functions/lib/libvelox_functions_lib_date_time_formatter.a" + ) if(BUILD_TESTS) - add_velox_dependency(exec::test "${VELOX_COMPONENTS_PATH}/exec/tests/utils/libvelox_exec_test_lib.a") - add_velox_dependency(temp::path "${VELOX_COMPONENTS_PATH}/exec/tests/utils/libvelox_temp_path.a") - add_velox_dependency(dwio::common::test::utils "${VELOX_COMPONENTS_PATH}/dwio/common/tests/utils/libvelox_dwio_common_test_utils.a") + add_velox_dependency( + exec::test + "${VELOX_COMPONENTS_PATH}/exec/tests/utils/libvelox_exec_test_lib.a") + add_velox_dependency( + temp::path + "${VELOX_COMPONENTS_PATH}/exec/tests/utils/libvelox_temp_path.a") + add_velox_dependency( + dwio::common::test::utils + "${VELOX_COMPONENTS_PATH}/dwio/common/tests/utils/libvelox_dwio_common_test_utils.a" + ) endif() add_velox_dependency(exec "${VELOX_COMPONENTS_PATH}/exec/libvelox_exec.a") if(BUILD_TESTS) - add_velox_dependency(parse::parser "${VELOX_COMPONENTS_PATH}/parse/libvelox_parse_parser.a") - add_velox_dependency(duckdb::parser "${VELOX_COMPONENTS_PATH}/duckdb/conversion/libvelox_duckdb_parser.a") - add_velox_dependency(parse::expression "${VELOX_COMPONENTS_PATH}/parse/libvelox_parse_expression.a") - add_velox_dependency(parse::utils "${VELOX_COMPONENTS_PATH}/parse/libvelox_parse_utils.a") - add_velox_dependency(function::registry "${VELOX_COMPONENTS_PATH}/functions/libvelox_function_registry.a") + add_velox_dependency( + parse::parser "${VELOX_COMPONENTS_PATH}/parse/libvelox_parse_parser.a") + add_velox_dependency( + duckdb::parser + "${VELOX_COMPONENTS_PATH}/duckdb/conversion/libvelox_duckdb_parser.a") + add_velox_dependency( + parse::expression + "${VELOX_COMPONENTS_PATH}/parse/libvelox_parse_expression.a") + add_velox_dependency( + parse::utils "${VELOX_COMPONENTS_PATH}/parse/libvelox_parse_utils.a") + add_velox_dependency( + function::registry + "${VELOX_COMPONENTS_PATH}/functions/libvelox_function_registry.a") endif() - add_velox_dependency(vector::arrow::bridge "${VELOX_COMPONENTS_PATH}/vector/arrow/libvelox_arrow_bridge.a") + add_velox_dependency( + vector::arrow::bridge + "${VELOX_COMPONENTS_PATH}/vector/arrow/libvelox_arrow_bridge.a") add_velox_dependency(row "${VELOX_COMPONENTS_PATH}/row/libvelox_row_fast.a") - add_velox_dependency(connector "${VELOX_COMPONENTS_PATH}/connectors/libvelox_connector.a") - add_velox_dependency(connector::hive_parition 
"${VELOX_COMPONENTS_PATH}/connectors/hive/libvelox_hive_partition_function.a") - add_velox_dependency(connector::hive::iceberg::IcebergSplitReader "${VELOX_COMPONENTS_PATH}/connectors/hive/iceberg/libvelox_hive_iceberg_splitreader.a") + add_velox_dependency( + connector "${VELOX_COMPONENTS_PATH}/connectors/libvelox_connector.a") + add_velox_dependency( + connector::hive_parition + "${VELOX_COMPONENTS_PATH}/connectors/hive/libvelox_hive_partition_function.a" + ) + add_velox_dependency( + connector::hive::iceberg::IcebergSplitReader + "${VELOX_COMPONENTS_PATH}/connectors/hive/iceberg/libvelox_hive_iceberg_splitreader.a" + ) if(ENABLE_HDFS) - add_velox_dependency(connector::hive::hdfs "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/hdfs/libvelox_hdfs.a") + add_velox_dependency( + connector::hive::hdfs + "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/hdfs/libvelox_hdfs.a" + ) endif() if(ENABLE_GCS) - add_velox_dependency(connector::hive::gcs "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/gcs/libvelox_gcs.a") + add_velox_dependency( + connector::hive::gcs + "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/gcs/libvelox_gcs.a" + ) endif() if(ENABLE_S3) - add_velox_dependency(connector::hive::s3fs "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/s3fs/libvelox_s3fs.a") + add_velox_dependency( + connector::hive::s3fs + "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/s3fs/libvelox_s3fs.a" + ) endif() if(ENABLE_ABFS) - add_velox_dependency(connector::hive::abfs "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/abfs/libvelox_abfs.a") + add_velox_dependency( + connector::hive::abfs + "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/abfs/libvelox_abfs.a" + ) endif() - add_velox_dependency(dwio::dwrf::writer "${VELOX_COMPONENTS_PATH}/dwio/dwrf/writer/libvelox_dwio_dwrf_writer.a") - add_velox_dependency(dwio::dwrf::reader "${VELOX_COMPONENTS_PATH}/dwio/dwrf/reader/libvelox_dwio_dwrf_reader.a") - add_velox_dependency(dwio::dwrf::utils "${VELOX_COMPONENTS_PATH}/dwio/dwrf/utils/libvelox_dwio_dwrf_utils.a") - add_velox_dependency(dwio::dwrf::common "${VELOX_COMPONENTS_PATH}/dwio/dwrf/common/libvelox_dwio_dwrf_common.a") - add_velox_dependency(parquet "${VELOX_COMPONENTS_PATH}/dwio/parquet/libvelox_dwio_parquet_reader.a") - add_velox_dependency(parquet::reader::native "${VELOX_COMPONENTS_PATH}/dwio/parquet/reader/libvelox_dwio_native_parquet_reader.a") + add_velox_dependency( + dwio::dwrf::writer + "${VELOX_COMPONENTS_PATH}/dwio/dwrf/writer/libvelox_dwio_dwrf_writer.a") + add_velox_dependency( + dwio::dwrf::reader + "${VELOX_COMPONENTS_PATH}/dwio/dwrf/reader/libvelox_dwio_dwrf_reader.a") + add_velox_dependency( + dwio::orc::reader + "${VELOX_COMPONENTS_PATH}/dwio/orc/reader/libvelox_dwio_orc_reader.a") + add_velox_dependency( + dwio::dwrf::utils + "${VELOX_COMPONENTS_PATH}/dwio/dwrf/utils/libvelox_dwio_dwrf_utils.a") + add_velox_dependency( + dwio::dwrf::common + "${VELOX_COMPONENTS_PATH}/dwio/dwrf/common/libvelox_dwio_dwrf_common.a") + add_velox_dependency( + parquet + "${VELOX_COMPONENTS_PATH}/dwio/parquet/libvelox_dwio_parquet_reader.a") + add_velox_dependency( + parquet::reader::native + "${VELOX_COMPONENTS_PATH}/dwio/parquet/reader/libvelox_dwio_native_parquet_reader.a" + ) if(BUILD_TESTS) - add_velox_dependency(dwio::common::utils "${VELOX_COMPONENTS_PATH}/dwio/common/tests/utils/libvelox_dwio_common_test_utils.a") - add_velox_dependency(dwio::dwrf::test_utils 
"${VELOX_COMPONENTS_PATH}/dwio/dwrf/test/utils/libvelox_dwrf_test_utils.a") - add_velox_dependency(parquet::reader::duckdb_conversion "${VELOX_COMPONENTS_PATH}/duckdb/conversion/libvelox_duckdb_conversion.a") + add_velox_dependency( + dwio::common::utils + "${VELOX_COMPONENTS_PATH}/dwio/common/tests/utils/libvelox_dwio_common_test_utils.a" + ) + add_velox_dependency( + dwio::dwrf::test_utils + "${VELOX_COMPONENTS_PATH}/dwio/dwrf/test/utils/libvelox_dwrf_test_utils.a" + ) + add_velox_dependency( + parquet::reader::duckdb_conversion + "${VELOX_COMPONENTS_PATH}/duckdb/conversion/libvelox_duckdb_conversion.a") add_duckdb() - add_velox_dependency(tpch::gen "${VELOX_COMPONENTS_PATH}/tpch/gen/libvelox_tpch_gen.a") - add_velox_dependency(dbgen "${VELOX_COMPONENTS_PATH}/tpch/gen/dbgen/libvelox_dbgen.a") + add_velox_dependency( + tpch::gen "${VELOX_COMPONENTS_PATH}/tpch/gen/libvelox_tpch_gen.a") + add_velox_dependency( + dbgen "${VELOX_COMPONENTS_PATH}/tpch/gen/dbgen/libvelox_dbgen.a") endif() - add_velox_dependency(parquet::reader::thrift "${VELOX_COMPONENTS_PATH}/dwio/parquet/thrift/libvelox_dwio_parquet_thrift.a") - - add_velox_dependency(velox::arrow::parquet::writer "${VELOX_COMPONENTS_PATH}/dwio/parquet/writer/libvelox_dwio_arrow_parquet_writer.a") - add_velox_dependency(dwio::arrow::parquet::writer "${VELOX_COMPONENTS_PATH}/dwio/parquet/writer/arrow/libvelox_dwio_arrow_parquet_writer_lib.a") - add_velox_dependency(dwio::arrow::parquet::writer::util "${VELOX_COMPONENTS_PATH}/dwio/parquet/writer/arrow/util/libvelox_dwio_arrow_parquet_writer_util_lib.a") - add_velox_dependency(dwio::arrow::parquet::writer::thrift::lib "${VELOX_COMPONENTS_PATH}/dwio/parquet/writer/arrow/generated/libvelox_dwio_arrow_parquet_writer_thrift_lib.a") - add_velox_dependency(dwio::common::compression "${VELOX_COMPONENTS_PATH}/dwio/common/compression/libvelox_dwio_common_compression.a") - add_velox_dependency(dwio::common "${VELOX_COMPONENTS_PATH}/dwio/common/libvelox_dwio_common.a") - add_velox_dependency(functions::prestosql::types "${VELOX_COMPONENTS_PATH}/functions/prestosql/types/libvelox_presto_types.a") - add_velox_dependency(functions::spark::specialforms "${VELOX_COMPONENTS_PATH}/functions/sparksql/specialforms/libvelox_functions_spark_specialforms.a") - add_velox_dependency(expression "${VELOX_COMPONENTS_PATH}/expression/libvelox_expression.a") + add_velox_dependency( + parquet::reader::thrift + "${VELOX_COMPONENTS_PATH}/dwio/parquet/thrift/libvelox_dwio_parquet_thrift.a" + ) + + add_velox_dependency( + velox::arrow::parquet::writer + "${VELOX_COMPONENTS_PATH}/dwio/parquet/writer/libvelox_dwio_arrow_parquet_writer.a" + ) + add_velox_dependency( + dwio::arrow::parquet::writer + "${VELOX_COMPONENTS_PATH}/dwio/parquet/writer/arrow/libvelox_dwio_arrow_parquet_writer_lib.a" + ) + add_velox_dependency( + dwio::arrow::parquet::writer::util + "${VELOX_COMPONENTS_PATH}/dwio/parquet/writer/arrow/util/libvelox_dwio_arrow_parquet_writer_util_lib.a" + ) + add_velox_dependency( + dwio::arrow::parquet::writer::thrift::lib + "${VELOX_COMPONENTS_PATH}/dwio/parquet/writer/arrow/generated/libvelox_dwio_arrow_parquet_writer_thrift_lib.a" + ) + add_velox_dependency( + dwio::common::compression + "${VELOX_COMPONENTS_PATH}/dwio/common/compression/libvelox_dwio_common_compression.a" + ) + add_velox_dependency( + dwio::common "${VELOX_COMPONENTS_PATH}/dwio/common/libvelox_dwio_common.a") + add_velox_dependency( + functions::prestosql::types + "${VELOX_COMPONENTS_PATH}/functions/prestosql/types/libvelox_presto_types.a" + ) + 
add_velox_dependency( + functions::spark::specialforms + "${VELOX_COMPONENTS_PATH}/functions/sparksql/specialforms/libvelox_functions_spark_specialforms.a" + ) + add_velox_dependency( + expression "${VELOX_COMPONENTS_PATH}/expression/libvelox_expression.a") add_velox_dependency(core "${VELOX_COMPONENTS_PATH}/core/libvelox_core.a") - add_velox_dependency(type::fbhive "${VELOX_COMPONENTS_PATH}/type/fbhive/libvelox_type_fbhive.a") + add_velox_dependency( + type::fbhive "${VELOX_COMPONENTS_PATH}/type/fbhive/libvelox_type_fbhive.a") add_velox_dependency(type "${VELOX_COMPONENTS_PATH}/type/libvelox_type.a") - add_velox_dependency(vector::serializes "${VELOX_COMPONENTS_PATH}/serializers/libvelox_presto_serializer.a") - add_velox_dependency(functions::lib::util "${VELOX_COMPONENTS_PATH}/functions/lib/libvelox_functions_util.a") - add_velox_dependency(vector "${VELOX_COMPONENTS_PATH}/vector/libvelox_vector.a") - add_velox_dependency(expression::function "${VELOX_COMPONENTS_PATH}/expression/libvelox_expression_functions.a") - add_velox_dependency(expression::type_calculation "${VELOX_COMPONENTS_PATH}/expression/type_calculation/libvelox_type_calculation.a") - - add_velox_dependency(common::caching "${VELOX_COMPONENTS_PATH}/common/caching/libvelox_caching.a") - add_velox_dependency(common::base "${VELOX_COMPONENTS_PATH}/common/base/libvelox_common_base.a") - add_velox_dependency(common::memory "${VELOX_COMPONENTS_PATH}/common/memory/libvelox_memory.a") - add_velox_dependency(common::serialization "${VELOX_COMPONENTS_PATH}/common/serialization/libvelox_serialization.a") - add_velox_dependency(common::base::exception "${VELOX_COMPONENTS_PATH}/common/base/libvelox_exception.a") - - add_velox_dependency(type::tz "${VELOX_COMPONENTS_PATH}/type/tz/libvelox_type_tz.a") - add_velox_dependency(dwio::dwrf::proto "${VELOX_COMPONENTS_PATH}/dwio/dwrf/proto/libvelox_dwio_dwrf_proto.a") - add_velox_dependency(dwio::catalog::fbhive "${VELOX_COMPONENTS_PATH}/dwio/catalog/fbhive/libvelox_dwio_catalog_fbhive.a") - add_velox_dependency(dwio::common::exception "${VELOX_COMPONENTS_PATH}/dwio/common/exception/libvelox_dwio_common_exception.a") - add_velox_dependency(dwio::common::encryption "${VELOX_COMPONENTS_PATH}/dwio/common/encryption/libvelox_dwio_common_encryption.a") - - add_velox_dependency(core::config "${VELOX_COMPONENTS_PATH}/core/libvelox_config.a") - add_velox_dependency(common::encode "${VELOX_COMPONENTS_PATH}/common/encode/libvelox_encode.a") - add_velox_dependency(common::time "${VELOX_COMPONENTS_PATH}/common/time/libvelox_time.a") + add_velox_dependency( + vector::serializes + "${VELOX_COMPONENTS_PATH}/serializers/libvelox_presto_serializer.a") + add_velox_dependency( + functions::lib::util + "${VELOX_COMPONENTS_PATH}/functions/lib/libvelox_functions_util.a") + add_velox_dependency(vector + "${VELOX_COMPONENTS_PATH}/vector/libvelox_vector.a") + add_velox_dependency( + expression::function + "${VELOX_COMPONENTS_PATH}/expression/libvelox_expression_functions.a") + add_velox_dependency( + expression::type_calculation + "${VELOX_COMPONENTS_PATH}/expression/type_calculation/libvelox_type_calculation.a" + ) + + add_velox_dependency( + common::caching + "${VELOX_COMPONENTS_PATH}/common/caching/libvelox_caching.a") + add_velox_dependency( + common::base "${VELOX_COMPONENTS_PATH}/common/base/libvelox_common_base.a") + add_velox_dependency( + common::memory "${VELOX_COMPONENTS_PATH}/common/memory/libvelox_memory.a") + add_velox_dependency( + common::serialization + 
"${VELOX_COMPONENTS_PATH}/common/serialization/libvelox_serialization.a") + add_velox_dependency( + common::base::exception + "${VELOX_COMPONENTS_PATH}/common/base/libvelox_exception.a") + + add_velox_dependency(type::tz + "${VELOX_COMPONENTS_PATH}/type/tz/libvelox_type_tz.a") + add_velox_dependency( + dwio::dwrf::proto + "${VELOX_COMPONENTS_PATH}/dwio/dwrf/proto/libvelox_dwio_dwrf_proto.a") + add_velox_dependency( + dwio::catalog::fbhive + "${VELOX_COMPONENTS_PATH}/dwio/catalog/fbhive/libvelox_dwio_catalog_fbhive.a" + ) + add_velox_dependency( + dwio::common::exception + "${VELOX_COMPONENTS_PATH}/dwio/common/exception/libvelox_dwio_common_exception.a" + ) + add_velox_dependency( + dwio::common::encryption + "${VELOX_COMPONENTS_PATH}/dwio/common/encryption/libvelox_dwio_common_encryption.a" + ) + + add_velox_dependency(core::config + "${VELOX_COMPONENTS_PATH}/core/libvelox_config.a") + add_velox_dependency( + common::encode "${VELOX_COMPONENTS_PATH}/common/encode/libvelox_encode.a") + add_velox_dependency(common::time + "${VELOX_COMPONENTS_PATH}/common/time/libvelox_time.a") if(BUILD_TESTS) - add_velox_dependency(common::file::test "${VELOX_COMPONENTS_PATH}/common/file/tests/libvelox_file_test_utils.a") + add_velox_dependency( + common::file::test + "${VELOX_COMPONENTS_PATH}/common/file/tests/libvelox_file_test_utils.a") endif() - add_velox_dependency(common::file "${VELOX_COMPONENTS_PATH}/common/file/libvelox_file.a") - add_velox_dependency(common::process "${VELOX_COMPONENTS_PATH}/common/process/libvelox_process.a") - - add_velox_dependency(common::test_util "${VELOX_COMPONENTS_PATH}/common/testutil/libvelox_test_util.a") - - add_velox_dependency(external::md5 "${VELOX_COMPONENTS_PATH}/external/md5/libmd5.a") - add_velox_dependency(external::date "${VELOX_COMPONENTS_PATH}/external/date/libvelox_external_date.a") - add_velox_dependency(velox::parquet::writer "${VELOX_COMPONENTS_PATH}/dwio/parquet/libvelox_dwio_parquet_writer.a") + add_velox_dependency(common::file + "${VELOX_COMPONENTS_PATH}/common/file/libvelox_file.a") + add_velox_dependency( + common::process + "${VELOX_COMPONENTS_PATH}/common/process/libvelox_process.a") + + add_velox_dependency( + common::test_util + "${VELOX_COMPONENTS_PATH}/common/testutil/libvelox_test_util.a") + + add_velox_dependency(external::md5 + "${VELOX_COMPONENTS_PATH}/external/md5/libmd5.a") + add_velox_dependency( + external::date + "${VELOX_COMPONENTS_PATH}/external/date/libvelox_external_date.a") + add_velox_dependency( + velox::parquet::writer + "${VELOX_COMPONENTS_PATH}/dwio/parquet/libvelox_dwio_parquet_writer.a") if(BUILD_TESTS) - add_velox_dependency(vector::test::util "${VELOX_COMPONENTS_PATH}/vector/tests/utils/libvelox_vector_test_lib.a") + add_velox_dependency( + vector::test::util + "${VELOX_COMPONENTS_PATH}/vector/tests/utils/libvelox_vector_test_lib.a") endif() - add_velox_dependency(common::compression "${VELOX_COMPONENTS_PATH}/common/compression/libvelox_common_compression.a") - add_velox_dependency(common::io "${VELOX_COMPONENTS_PATH}/common/io/libvelox_common_io.a") - add_velox_dependency(velox::status "${VELOX_COMPONENTS_PATH}/common/base/libvelox_status.a") - add_velox_dependency(external::simdjson "${VELOX_BUILD_PATH}/_deps/simdjson-build/libsimdjson.a") + add_velox_dependency( + common::compression + "${VELOX_COMPONENTS_PATH}/common/compression/libvelox_common_compression.a") + add_velox_dependency( + common::io "${VELOX_COMPONENTS_PATH}/common/io/libvelox_common_io.a") + add_velox_dependency(velox::status + 
"${VELOX_COMPONENTS_PATH}/common/base/libvelox_status.a") endmacro() macro(find_libhdfs3) @@ -241,18 +443,16 @@ macro(find_libhdfs3) find_path(libhdfs3_INCLUDE_DIR hdfs/hdfs.h) set(CMAKE_FIND_LIBRARY_SUFFIXES ".so") find_library(libhdfs3_LIBRARY NAMES hdfs3) - find_package_handle_standard_args(libhdfs3 DEFAULT_MSG - libhdfs3_INCLUDE_DIR - libhdfs3_LIBRARY - ) + find_package_handle_standard_args(libhdfs3 DEFAULT_MSG libhdfs3_INCLUDE_DIR + libhdfs3_LIBRARY) add_library(HDFS::hdfs3 SHARED IMPORTED) - set_target_properties(HDFS::hdfs3 PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${libhdfs3_INCLUDE_DIR}" - IMPORTED_LOCATION "${libhdfs3_LIBRARY}" - ) + set_target_properties( + HDFS::hdfs3 + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${libhdfs3_INCLUDE_DIR}" + IMPORTED_LOCATION "${libhdfs3_LIBRARY}") endif() - if (NOT libhdfs3_FOUND) + if(NOT libhdfs3_FOUND) message(FATAL_ERROR "LIBHDFS3 Library Not Found") endif() endmacro() @@ -262,30 +462,35 @@ macro(find_re2) if(re2_FOUND AND TARGET re2::re2) set(RE2_LIBRARY re2::re2) else() - find_library(RE2_LIBRARY NAMES re2 PATHS ${SYSTEM_LIB_PATH} ${SYSTEM_LIB64_PATH} ${SYSTEM_LIB_MULTIARCH_PATH} ${SYSTEM_LOCAL_LIB_PATH} ${SYSTEM_LOCAL_LIB64_PATH} NO_DEFAULT_PATH) + find_library( + RE2_LIBRARY + NAMES re2 + PATHS ${SYSTEM_LIB_PATH} ${SYSTEM_LIB64_PATH} ${SYSTEM_LIB_MULTIARCH_PATH} + ${SYSTEM_LOCAL_LIB_PATH} ${SYSTEM_LOCAL_LIB64_PATH} + NO_DEFAULT_PATH) endif() - if (NOT RE2_LIBRARY) - message(FATAL_ERROR "RE2 Library Not Found") + if(NOT RE2_LIBRARY) + message(FATAL_ERROR "RE2 Library Not Found") else() message(STATUS "RE2 Library Can Be Found in ${RE2_LIBRARY}") endif() endmacro() macro(find_awssdk) - set (CMAKE_FIND_LIBRARY_SUFFIXES ".a") + set(CMAKE_FIND_LIBRARY_SUFFIXES ".a") find_package(AWSSDK REQUIRED COMPONENTS s3;identity-management) endmacro() macro(find_gcssdk) - set (CMAKE_FIND_LIBRARY_SUFFIXES ".so") + set(CMAKE_FIND_LIBRARY_SUFFIXES ".so") find_package(google_cloud_cpp_storage REQUIRED) endmacro() macro(find_azure) find_package(CURL REQUIRED) find_package(LibXml2 REQUIRED) - set (CMAKE_FIND_LIBRARY_SUFFIXES ".a") + set(CMAKE_FIND_LIBRARY_SUFFIXES ".a") find_package(azure-storage-blobs-cpp CONFIG REQUIRED) find_package(azure-storage-files-datalake-cpp CONFIG REQUIRED) endmacro() @@ -311,6 +516,7 @@ set(VELOX_SRCS operators/serializer/VeloxRowToColumnarConverter.cc operators/writer/VeloxParquetDatasource.cc shuffle/VeloxShuffleReader.cc + shuffle/VeloxShuffleWriter.cc shuffle/VeloxHashBasedShuffleWriter.cc shuffle/VeloxSortBasedShuffleWriter.cc substrait/SubstraitParser.cc @@ -327,11 +533,11 @@ set(VELOX_SRCS utils/VeloxArrowUtils.cc utils/ConfigExtractor.cc utils/Common.cc - ) + utils/VeloxBatchAppender.cc) -if (ENABLE_HDFS) +if(ENABLE_HDFS) list(APPEND VELOX_SRCS utils/HdfsUtils.cc) -endif () +endif() if(ENABLE_S3) find_package(ZLIB) @@ -345,49 +551,68 @@ add_library(velox SHARED ${VELOX_SRCS}) if(ENABLE_GLUTEN_VCPKG) # Hide symbols of static dependencies - target_link_options(velox PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/symbols.map) + target_link_options( + velox PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/symbols.map) endif() -target_include_directories(velox PUBLIC - ${CMAKE_SYSTEM_INCLUDE_PATH} - ${JNI_INCLUDE_DIRS} - ${CMAKE_CURRENT_SOURCE_DIR} - ${VELOX_HOME}/ - ${VELOX_BUILD_PATH}/ - ${VELOX_BUILD_PATH}/_deps/xsimd-src/include/ - ${VELOX_HOME}/third_party/xsimd/include/) - -set_target_properties(velox PROPERTIES - LIBRARY_OUTPUT_DIRECTORY ${root_directory}/releases - ) - -## If folly is not installed in 
system lib paths, please add -## `-DCMAKE_PREFIX_PATH="${folly lib path}" to cmake arguments. -## It is also applicable to other dependencies. +target_include_directories( + velox + PUBLIC ${CMAKE_SYSTEM_INCLUDE_PATH} + ${JNI_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR} + ${VELOX_HOME}/ + ${VELOX_BUILD_PATH}/ + ${VELOX_BUILD_PATH}/_deps/xsimd-src/include/ + ${VELOX_HOME}/third_party/xsimd/include/) + +set_target_properties(velox PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${root_directory}/releases) + +# If folly is not installed in system lib paths, please add +# `-DCMAKE_PREFIX_PATH="${folly lib path}" to cmake arguments. It is also +# applicable to other dependencies. find_package(Folly REQUIRED CONFIG) -target_include_directories(velox PUBLIC - ${GTEST_INCLUDE_DIRS} - ${PROTOBUF_INCLUDE}) +target_include_directories(velox PUBLIC ${GTEST_INCLUDE_DIRS} + ${PROTOBUF_INCLUDE}) + +if(BUILD_JEMALLOC) + include(Findjemalloc_pic) + find_jemalloc() + if(JEMALLOC_NOT_FOUND) + include(Buildjemalloc_pic) + build_jemalloc() + endif() + add_definitions(-DENABLE_JEMALLOC) + target_link_libraries(velox PUBLIC jemalloc::libjemalloc) +endif() target_link_libraries(velox PUBLIC gluten) add_velox_dependencies() -# Arrow libraries appear after Velox dependencies to avoid linker error -find_arrow_lib(${ARROW_LIB_NAME}) -find_arrow_lib(${PARQUET_LIB_NAME}) -target_link_libraries(velox PUBLIC Arrow::arrow Arrow::parquet) - target_link_libraries(velox PUBLIC Folly::folly) find_re2() target_link_libraries(velox PUBLIC ${RE2_LIBRARY}) -# since https://github.com/facebookincubator/velox/commit/47970417ac92135e862c0fde350d4d60fa2f1423 +# since +# https://github.com/facebookincubator/velox/commit/47970417ac92135e862c0fde350d4d60fa2f1423 if(Stemmer_FOUND) target_link_libraries(velox PUBLIC stemmer::stemmer) else() - add_velox_dependency(velox "${VELOX_BUILD_PATH}/_deps/libstemmer/src/libstemmer/libstemmer.a") + add_velox_dependency( + velox "${VELOX_BUILD_PATH}/_deps/libstemmer/src/libstemmer/libstemmer.a") +endif() + +set(CMAKE_FIND_LIBRARY_SUFFIXES_BCK ${CMAKE_FIND_LIBRARY_SUFFIXES}) +set(CMAKE_FIND_LIBRARY_SUFFIXES ".a") +find_package(simdjson CONFIG) +if(simdjson_FOUND AND TARGET simdjson::simdjson) + target_link_libraries(velox PUBLIC simdjson::simdjson) +else() + add_velox_dependency(external::simdjson + "${VELOX_BUILD_PATH}/_deps/simdjson-build/libsimdjson.a") endif() +set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_BCK}) if(ENABLE_GLUTEN_VCPKG) find_package(Thrift CONFIG) @@ -400,7 +625,8 @@ endif() if(Thrift_FOUND) target_link_libraries(velox PUBLIC thrift::thrift) else() - add_velox_dependency(thrift "${ARROW_HOME}/src/arrow_ep-build/thrift_ep-install/lib/libthrift.a") + add_velox_dependency( + thrift "${ARROW_HOME}/src/arrow_ep-build/thrift_ep-install/lib/libthrift.a") endif() if(BUILD_TESTS) @@ -440,7 +666,13 @@ if(ENABLE_ABFS) target_link_libraries(velox PUBLIC Azure::azure-storage-files-datalake) endif() -add_custom_command(TARGET velox POST_BUILD COMMAND ld $ || true - COMMENT "Checking ld result of libvelox.so") -add_custom_command(TARGET velox POST_BUILD COMMAND ldd $ || true - COMMENT "Checking ldd result of libvelox.so") +add_custom_command( + TARGET velox + POST_BUILD + COMMAND ld $ || true + COMMENT "Checking ld result of libvelox.so") +add_custom_command( + TARGET velox + POST_BUILD + COMMAND ldd $ || true + COMMENT "Checking ldd result of libvelox.so") diff --git a/cpp/velox/benchmarks/CMakeLists.txt b/cpp/velox/benchmarks/CMakeLists.txt index 74f21c29bc1d7..1aa199b13696a 100644 --- 
a/cpp/velox/benchmarks/CMakeLists.txt +++ b/cpp/velox/benchmarks/CMakeLists.txt @@ -15,10 +15,15 @@ find_arrow_lib(${PARQUET_LIB_NAME}) -set(VELOX_BENCHMARK_COMMON_SRCS common/FileReaderIterator.cc common/BenchmarkUtils.cc) +set(VELOX_BENCHMARK_COMMON_SRCS common/FileReaderIterator.cc + common/BenchmarkUtils.cc) add_library(velox_benchmark_common STATIC ${VELOX_BENCHMARK_COMMON_SRCS}) -target_include_directories(velox_benchmark_common PUBLIC ${CMAKE_SOURCE_DIR}/velox ${CMAKE_SOURCE_DIR}/core) -target_link_libraries(velox_benchmark_common PUBLIC Arrow::parquet velox benchmark::benchmark google::glog) +target_include_directories( + velox_benchmark_common PUBLIC ${CMAKE_SOURCE_DIR}/velox + ${CMAKE_SOURCE_DIR}/core) +target_link_libraries( + velox_benchmark_common PUBLIC Arrow::parquet velox benchmark::benchmark + google::glog) function(add_velox_benchmark BM_EXEC BM_FILE) add_executable(${BM_EXEC} ${BM_FILE}) @@ -34,8 +39,6 @@ add_velox_benchmark(parquet_write_benchmark ParquetWriteBenchmark.cc) add_velox_benchmark(plan_validator_util PlanValidatorUtil.cc) -add_velox_benchmark(shuffle_split_benchmark ShuffleSplitBenchmark.cc) - if(ENABLE_ORC) add_velox_benchmark(orc_converter exec/OrcConverter.cc) endif() diff --git a/cpp/velox/benchmarks/GenericBenchmark.cc b/cpp/velox/benchmarks/GenericBenchmark.cc index b7a50800e4ea7..87d77b7154f23 100644 --- a/cpp/velox/benchmarks/GenericBenchmark.cc +++ b/cpp/velox/benchmarks/GenericBenchmark.cc @@ -31,10 +31,10 @@ #include "compute/VeloxRuntime.h" #include "config/GlutenConfig.h" #include "shuffle/LocalPartitionWriter.h" -#include "shuffle/VeloxHashBasedShuffleWriter.h" #include "shuffle/VeloxShuffleWriter.h" #include "shuffle/rss/RssPartitionWriter.h" #include "utils/StringUtil.h" +#include "utils/Timer.h" #include "utils/VeloxArrowUtils.h" #include "utils/exception.h" #include "utils/tests/LocalRssClient.h" @@ -47,13 +47,18 @@ namespace { DEFINE_bool(print_result, true, "Print result for execution"); DEFINE_string(save_output, "", "Path to parquet file for saving the task output iterator"); DEFINE_bool(with_shuffle, false, "Add shuffle split at end."); -DEFINE_string(partitioning, "rr", "Short partitioning name. Valid options are rr, hash, range, single"); +DEFINE_string( + partitioning, + "rr", + "Short partitioning name. Valid options are rr, hash, range, single, random (only for test purpose)"); +DEFINE_string(shuffle_writer, "hash", "Shuffle writer type. Can be hash or sort"); DEFINE_bool(rss, false, "Mocking rss."); -DEFINE_bool(zstd, false, "Use ZSTD as shuffle compression codec"); -DEFINE_bool(qat_gzip, false, "Use QAT GZIP as shuffle compression codec"); -DEFINE_bool(qat_zstd, false, "Use QAT ZSTD as shuffle compression codec"); -DEFINE_bool(iaa_gzip, false, "Use IAA GZIP as shuffle compression codec"); +DEFINE_string( + compression, + "lz4", + "Specify the compression codec. Valid options are lz4, zstd, qat_gzip, qat_zstd, iaa_gzip"); DEFINE_int32(shuffle_partitions, 200, "Number of shuffle split (reducer) partitions"); +DEFINE_bool(run_shuffle, false, "Only run shuffle write."); DEFINE_bool(run_example, false, "Run the example and exit."); DEFINE_string(plan, "", "Path to input json file of the substrait plan."); @@ -61,33 +66,58 @@ DEFINE_string( split, "", "Path to input json file of the splits. Only valid for simulating the first stage. 
Use comma-separated list for multiple splits."); -DEFINE_string(data, "", "Path to input data files in parquet format, used for shuffle read."); +DEFINE_string(data, "", "Path to input data files in parquet format. Use comma-separated list for multiple files."); DEFINE_string(conf, "", "Path to the configuration file."); DEFINE_string(write_path, "/tmp", "Path to save the output from write tasks."); DEFINE_int64(memory_limit, std::numeric_limits::max(), "Memory limit used to trigger spill."); +DEFINE_string( + scan_mode, + "stream", + "Scan mode for reading parquet data." + "'stream' mode: Input file scan happens inside of the pipeline." + "'buffered' mode: First read all data into memory and feed the pipeline with it."); struct WriterMetrics { int64_t splitTime; int64_t evictTime; int64_t writeTime; int64_t compressTime; + + public: + explicit WriterMetrics() : splitTime(0), evictTime(0), writeTime(0), compressTime(0) {} }; -std::shared_ptr createShuffleWriter( - VeloxMemoryManager* memoryManager, - const std::string& dataFile, - const std::vector& localDirs) { +void setUpBenchmark(::benchmark::internal::Benchmark* bm) { + if (FLAGS_threads > 0) { + bm->Threads(FLAGS_threads); + } else { + bm->ThreadRange(1, std::thread::hardware_concurrency()); + } + if (FLAGS_iterations > 0) { + bm->Iterations(FLAGS_iterations); + } +} + +std::shared_ptr +createShuffleWriter(Runtime* runtime, const std::string& dataFile, const std::vector& localDirs) { PartitionWriterOptions partitionWriterOptions{}; - if (FLAGS_zstd) { + + // Configure compression. + if (FLAGS_compression == "lz4") { + partitionWriterOptions.codecBackend = CodecBackend::NONE; + partitionWriterOptions.compressionType = arrow::Compression::LZ4_FRAME; + partitionWriterOptions.compressionTypeStr = "lz4"; + } else if (FLAGS_compression == "zstd") { partitionWriterOptions.codecBackend = CodecBackend::NONE; partitionWriterOptions.compressionType = arrow::Compression::ZSTD; - } else if (FLAGS_qat_gzip) { + partitionWriterOptions.compressionTypeStr = "zstd"; + } else if (FLAGS_compression == "qat_gzip") { partitionWriterOptions.codecBackend = CodecBackend::QAT; partitionWriterOptions.compressionType = arrow::Compression::GZIP; - } else if (FLAGS_qat_zstd) { + } else if (FLAGS_compression == "qat_zstd") { partitionWriterOptions.codecBackend = CodecBackend::QAT; partitionWriterOptions.compressionType = arrow::Compression::ZSTD; - } else if (FLAGS_iaa_gzip) { + } else if (FLAGS_compression == "iaa_gzip") { partitionWriterOptions.codecBackend = CodecBackend::IAA; partitionWriterOptions.compressionType = arrow::Compression::GZIP; } @@ -98,180 +128,231 @@ std::shared_ptr createShuffleWriter( partitionWriter = std::make_unique( FLAGS_shuffle_partitions, std::move(partitionWriterOptions), - memoryManager->getArrowMemoryPool(), + runtime->memoryManager()->getArrowMemoryPool(), std::move(rssClient)); } else { partitionWriter = std::make_unique( FLAGS_shuffle_partitions, std::move(partitionWriterOptions), - memoryManager->getArrowMemoryPool(), + runtime->memoryManager()->getArrowMemoryPool(), dataFile, localDirs); } auto options = ShuffleWriterOptions{}; options.partitioning = gluten::toPartitioning(FLAGS_partitioning); - GLUTEN_ASSIGN_OR_THROW( - auto shuffleWriter, - VeloxHashBasedShuffleWriter::create( - FLAGS_shuffle_partitions, - std::move(partitionWriter), - std::move(options), - memoryManager->getLeafMemoryPool(), - memoryManager->getArrowMemoryPool())); - - return shuffleWriter; + if (FLAGS_shuffle_writer == "sort") { + options.shuffleWriterType = 
gluten::kSortShuffle; + } + auto shuffleWriter = + runtime->createShuffleWriter(FLAGS_shuffle_partitions, std::move(partitionWriter), std::move(options)); + + return std::reinterpret_pointer_cast(shuffleWriter); } void populateWriterMetrics( const std::shared_ptr& shuffleWriter, - int64_t shuffleWriteTime, + int64_t totalTime, WriterMetrics& metrics) { metrics.compressTime += shuffleWriter->totalCompressTime(); metrics.evictTime += shuffleWriter->totalEvictTime(); metrics.writeTime += shuffleWriter->totalWriteTime(); - metrics.evictTime += - (shuffleWriteTime - shuffleWriter->totalCompressTime() - shuffleWriter->totalEvictTime() - - shuffleWriter->totalWriteTime()); + auto splitTime = totalTime - metrics.compressTime - metrics.evictTime - metrics.writeTime; + if (splitTime > 0) { + metrics.splitTime += splitTime; + } +} + +void setCpu(::benchmark::State& state) { + // Pin each threads to different CPU# starting from 0 or --cpu. + auto cpu = state.thread_index(); + if (FLAGS_cpu != -1) { + cpu += FLAGS_cpu; + } + LOG(INFO) << "Setting CPU for thread " << state.thread_index() << " to " << cpu; + gluten::setCpu(cpu); +} + +void runShuffle( + Runtime* runtime, + BenchmarkAllocationListener* listener, + const std::shared_ptr& resultIter, + WriterMetrics& metrics) { + std::string dataFile; + std::vector localDirs; + bool isFromEnv; + GLUTEN_THROW_NOT_OK(setLocalDirsAndDataFileFromEnv(dataFile, localDirs, isFromEnv)); + + auto shuffleWriter = createShuffleWriter(runtime, dataFile, localDirs); + listener->setShuffleWriter(shuffleWriter.get()); + + int64_t totalTime = 0; + { + gluten::ScopedTimer timer(&totalTime); + while (resultIter->hasNext()) { + GLUTEN_THROW_NOT_OK(shuffleWriter->write(resultIter->next(), ShuffleWriter::kMinMemLimit)); + } + GLUTEN_THROW_NOT_OK(shuffleWriter->stop()); + } + + populateWriterMetrics(shuffleWriter, totalTime, metrics); + // Cleanup shuffle outputs + cleanupShuffleOutput(dataFile, localDirs, isFromEnv); +} + +void updateBenchmarkMetrics( + ::benchmark::State& state, + const int64_t& elapsedTime, + const int64_t& readInputTime, + const WriterMetrics& writerMetrics) { + state.counters["read_input_time"] = + benchmark::Counter(readInputTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); + state.counters["elapsed_time"] = + benchmark::Counter(elapsedTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); + + state.counters["shuffle_write_time"] = benchmark::Counter( + writerMetrics.writeTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); + state.counters["shuffle_spill_time"] = benchmark::Counter( + writerMetrics.evictTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); + state.counters["shuffle_split_time"] = benchmark::Counter( + writerMetrics.splitTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); + state.counters["shuffle_compress_time"] = benchmark::Counter( + writerMetrics.compressTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); } } // namespace +using RuntimeFactory = std::function listener)>; + auto BM_Generic = [](::benchmark::State& state, const std::string& planFile, const std::vector& splitFiles, const std::vector& dataFiles, - const std::unordered_map& conf, + RuntimeFactory runtimeFactory, FileReaderType readerType) { - // Pin each threads to different CPU# starting from 0 or --cpu. 
- if (FLAGS_cpu != -1) { - setCpu(FLAGS_cpu + state.thread_index()); - } else { - setCpu(state.thread_index()); - } - memory::MemoryManager::testingSetInstance({}); + setCpu(state); + + auto listener = std::make_unique(FLAGS_memory_limit); + auto* listenerPtr = listener.get(); + auto runtime = runtimeFactory(std::move(listener)); - auto memoryManager = std::make_unique( - "generic_benchmark", - gluten::defaultMemoryAllocator(), - std::make_unique(FLAGS_memory_limit)); - auto runtime = Runtime::create(kVeloxRuntimeKind, conf); auto plan = getPlanFromFile("Plan", planFile); std::vector splits{}; for (const auto& splitFile : splitFiles) { splits.push_back(getPlanFromFile("ReadRel.LocalFiles", splitFile)); } - auto startTime = std::chrono::steady_clock::now(); - int64_t collectBatchTime = 0; - WriterMetrics writerMetrics{}; - for (auto _ : state) { - std::vector> inputIters; - std::vector inputItersRaw; - if (!dataFiles.empty()) { - for (const auto& input : dataFiles) { - inputIters.push_back(getInputIteratorFromFileReader(input, readerType)); - } - std::transform( - inputIters.begin(), - inputIters.end(), - std::back_inserter(inputItersRaw), - [](std::shared_ptr iter) { - return static_cast(iter->getInputIter()); - }); - } - runtime->injectWriteFilesTempPath(FLAGS_write_path); - runtime->parsePlan(reinterpret_cast(plan.data()), plan.size(), std::nullopt); - for (auto& split : splits) { - runtime->parseSplitInfo(reinterpret_cast(split.data()), split.size(), std::nullopt); - } - auto resultIter = - runtime->createResultIterator(memoryManager.get(), "/tmp/test-spill", std::move(inputIters), conf); - if (auto listener = dynamic_cast(memoryManager->getListener())) { - listener->setIterator(resultIter.get()); - } - auto veloxPlan = dynamic_cast(runtime)->getVeloxPlan(); - if (FLAGS_with_shuffle) { - int64_t shuffleWriteTime; - TIME_NANO_START(shuffleWriteTime); - std::string dataFile; - std::vector localDirs; - bool isFromEnv; - GLUTEN_THROW_NOT_OK(setLocalDirsAndDataFileFromEnv(dataFile, localDirs, isFromEnv)); - const auto& shuffleWriter = createShuffleWriter(memoryManager.get(), dataFile, localDirs); - while (resultIter->hasNext()) { - GLUTEN_THROW_NOT_OK(shuffleWriter->write(resultIter->next(), ShuffleWriter::kMinMemLimit)); + WriterMetrics writerMetrics{}; + int64_t readInputTime = 0; + int64_t elapsedTime = 0; + + { + ScopedTimer timer(&elapsedTime); + for (auto _ : state) { + std::vector> inputIters; + std::vector inputItersRaw; + if (!dataFiles.empty()) { + for (const auto& input : dataFiles) { + inputIters.push_back(getInputIteratorFromFileReader(input, readerType)); + } + std::transform( + inputIters.begin(), + inputIters.end(), + std::back_inserter(inputItersRaw), + [](std::shared_ptr iter) { + return static_cast(iter->getInputIter()); + }); } - GLUTEN_THROW_NOT_OK(shuffleWriter->stop()); - TIME_NANO_END(shuffleWriteTime); - populateWriterMetrics(shuffleWriter, shuffleWriteTime, writerMetrics); - // Cleanup shuffle outputs - cleanupShuffleOutput(dataFile, localDirs, isFromEnv); - } else { - // May write the output into file. 
- ArrowSchema cSchema; - toArrowSchema(veloxPlan->outputType(), memoryManager->getLeafMemoryPool().get(), &cSchema); - GLUTEN_ASSIGN_OR_THROW(auto outputSchema, arrow::ImportSchema(&cSchema)); - ArrowWriter writer{FLAGS_save_output}; - state.PauseTiming(); - if (!FLAGS_save_output.empty()) { - GLUTEN_THROW_NOT_OK(writer.initWriter(*(outputSchema.get()))); + runtime->injectWriteFilesTempPath(FLAGS_write_path); + runtime->parsePlan(reinterpret_cast(plan.data()), plan.size(), std::nullopt); + for (auto& split : splits) { + runtime->parseSplitInfo(reinterpret_cast(split.data()), split.size(), std::nullopt); } - state.ResumeTiming(); + auto resultIter = runtime->createResultIterator("/tmp/test-spill", std::move(inputIters), runtime->getConfMap()); + listenerPtr->setIterator(resultIter.get()); - while (resultIter->hasNext()) { - auto array = resultIter->next()->exportArrowArray(); + if (FLAGS_with_shuffle) { + runShuffle(runtime, listenerPtr, resultIter, writerMetrics); + } else { + // May write the output into file. + auto veloxPlan = dynamic_cast(runtime)->getVeloxPlan(); + + ArrowSchema cSchema; + toArrowSchema(veloxPlan->outputType(), runtime->memoryManager()->getLeafMemoryPool().get(), &cSchema); + GLUTEN_ASSIGN_OR_THROW(auto outputSchema, arrow::ImportSchema(&cSchema)); + ArrowWriter writer{FLAGS_save_output}; state.PauseTiming(); - auto maybeBatch = arrow::ImportRecordBatch(array.get(), outputSchema); - if (!maybeBatch.ok()) { - state.SkipWithError(maybeBatch.status().message().c_str()); - return; + if (!FLAGS_save_output.empty()) { + GLUTEN_THROW_NOT_OK(writer.initWriter(*(outputSchema.get()))); } - if (FLAGS_print_result) { - LOG(INFO) << maybeBatch.ValueOrDie()->ToString(); + state.ResumeTiming(); + + while (resultIter->hasNext()) { + auto array = resultIter->next()->exportArrowArray(); + state.PauseTiming(); + auto maybeBatch = arrow::ImportRecordBatch(array.get(), outputSchema); + if (!maybeBatch.ok()) { + state.SkipWithError(maybeBatch.status().message().c_str()); + return; + } + if (FLAGS_print_result) { + LOG(INFO) << maybeBatch.ValueOrDie()->ToString(); + } + if (!FLAGS_save_output.empty()) { + GLUTEN_THROW_NOT_OK(writer.writeInBatches(maybeBatch.ValueOrDie())); + } } + + state.PauseTiming(); if (!FLAGS_save_output.empty()) { - GLUTEN_THROW_NOT_OK(writer.writeInBatches(maybeBatch.ValueOrDie())); + GLUTEN_THROW_NOT_OK(writer.closeWriter()); } + state.ResumeTiming(); } - state.PauseTiming(); - if (!FLAGS_save_output.empty()) { - GLUTEN_THROW_NOT_OK(writer.closeWriter()); - } - state.ResumeTiming(); - } - - collectBatchTime += - std::accumulate(inputItersRaw.begin(), inputItersRaw.end(), 0, [](int64_t sum, FileReaderIterator* iter) { - return sum + iter->getCollectBatchTime(); - }); + readInputTime += + std::accumulate(inputItersRaw.begin(), inputItersRaw.end(), 0, [](int64_t sum, FileReaderIterator* iter) { + return sum + iter->getCollectBatchTime(); + }); - auto* rawIter = static_cast(resultIter->getInputIter()); - const auto* task = rawIter->task(); - const auto* planNode = rawIter->veloxPlan(); - auto statsStr = facebook::velox::exec::printPlanWithStats(*planNode, task->taskStats(), true); - LOG(INFO) << statsStr; + auto* rawIter = static_cast(resultIter->getInputIter()); + const auto* task = rawIter->task(); + const auto* planNode = rawIter->veloxPlan(); + auto statsStr = facebook::velox::exec::printPlanWithStats(*planNode, task->taskStats(), true); + LOG(INFO) << statsStr; + } } + + updateBenchmarkMetrics(state, elapsedTime, readInputTime, writerMetrics); 
Runtime::release(runtime); +}; - auto endTime = std::chrono::steady_clock::now(); - auto duration = std::chrono::duration_cast(endTime - startTime).count(); +auto BM_ShuffleWrite = [](::benchmark::State& state, + const std::string& inputFile, + RuntimeFactory runtimeFactory, + FileReaderType readerType) { + setCpu(state); - state.counters["collect_batch_time"] = - benchmark::Counter(collectBatchTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); - state.counters["elapsed_time"] = - benchmark::Counter(duration, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); - state.counters["shuffle_write_time"] = benchmark::Counter( - writerMetrics.writeTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); - state.counters["shuffle_spill_time"] = benchmark::Counter( - writerMetrics.evictTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); - state.counters["shuffle_split_time"] = benchmark::Counter( - writerMetrics.splitTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); - state.counters["shuffle_compress_time"] = benchmark::Counter( - writerMetrics.compressTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); + auto listener = std::make_unique(FLAGS_memory_limit); + auto* listenerPtr = listener.get(); + auto runtime = runtimeFactory(std::move(listener)); - gluten::VeloxBackend::get()->tearDown(); + WriterMetrics writerMetrics{}; + int64_t readInputTime = 0; + int64_t elapsedTime = 0; + { + ScopedTimer timer(&elapsedTime); + for (auto _ : state) { + auto resultIter = getInputIteratorFromFileReader(inputFile, readerType); + runShuffle(runtime, listenerPtr, resultIter, writerMetrics); + + auto reader = static_cast(resultIter->getInputIter()); + readInputTime += reader->getCollectBatchTime(); + } + } + + updateBenchmarkMetrics(state, elapsedTime, readInputTime, writerMetrics); + Runtime::release(runtime); }; int main(int argc, char** argv) { @@ -279,10 +360,9 @@ int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); // Init Velox backend. - std::unordered_map backendConf; - std::unordered_map sessionConf; + auto backendConf = gluten::defaultConf(); + auto sessionConf = gluten::defaultConf(); backendConf.insert({gluten::kSparkBatchSize, std::to_string(FLAGS_batch_size)}); - backendConf.insert({kDebugModeEnabled, "true"}); if (!FLAGS_conf.empty()) { abortIfFileNotExists(FLAGS_conf); std::ifstream file(FLAGS_conf); @@ -334,6 +414,7 @@ int main(int argc, char** argv) { } initVeloxBackend(backendConf); + memory::MemoryManager::testingSetInstance({}); // Parse substrait plan, split file and data files. 
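// [Editorial aside: illustrative sketch, not part of this patch] The removed
// counter-setting code above suggests what the new updateBenchmarkMetrics()
// helper consolidates. With Google Benchmark, registering such per-iteration
// counters looks like the following (the helper's exact counter names are an
// assumption based on the deleted lines):
#include <benchmark/benchmark.h>
#include <cstdint>

void updateMetricsSketch(
    ::benchmark::State& state,
    int64_t elapsedTime,
    int64_t readInputTime,
    int64_t shuffleWriteTime) {
  state.counters["elapsed_time"] = ::benchmark::Counter(
      elapsedTime, ::benchmark::Counter::kAvgIterations, ::benchmark::Counter::OneK::kIs1000);
  state.counters["read_input_time"] = ::benchmark::Counter(
      readInputTime, ::benchmark::Counter::kAvgIterations, ::benchmark::Counter::OneK::kIs1000);
  state.counters["shuffle_write_time"] = ::benchmark::Counter(
      shuffleWriteTime, ::benchmark::Counter::kAvgIterations, ::benchmark::Counter::OneK::kIs1000);
}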
std::string substraitJsonFile = FLAGS_plan; @@ -352,6 +433,28 @@ int main(int argc, char** argv) { ::benchmark::Shutdown(); std::exit(EXIT_FAILURE); } + } else if (FLAGS_run_shuffle) { + std::string errorMsg{}; + if (FLAGS_data.empty()) { + errorMsg = "Missing '--split' or '--data' option."; + } else if (FLAGS_partitioning != "rr" && FLAGS_partitioning != "random") { + errorMsg = "--run-shuffle only support round-robin partitioning and random partitioning."; + } + if (errorMsg.empty()) { + try { + dataFiles = gluten::splitPaths(FLAGS_data, true); + if (dataFiles.size() > 1) { + errorMsg = "Only one data file is allowed for shuffle write."; + } + } catch (const std::exception& e) { + errorMsg = e.what(); + } + } + if (!errorMsg.empty()) { + LOG(ERROR) << "Incorrect usage: " << errorMsg << std::endl; + ::benchmark::Shutdown(); + std::exit(EXIT_FAILURE); + } } else { // Validate input args. std::string errorMsg{}; @@ -363,15 +466,17 @@ int main(int argc, char** argv) { errorMsg = "Missing '--split' or '--data' option."; } - try { - if (!FLAGS_data.empty()) { - dataFiles = gluten::splitPaths(FLAGS_data, true); - } - if (!FLAGS_split.empty()) { - splitFiles = gluten::splitPaths(FLAGS_split, true); + if (errorMsg.empty()) { + try { + if (!FLAGS_data.empty()) { + dataFiles = gluten::splitPaths(FLAGS_data, true); + } + if (!FLAGS_split.empty()) { + splitFiles = gluten::splitPaths(FLAGS_split, true); + } + } catch (const std::exception& e) { + errorMsg = e.what(); } - } catch (const std::exception& e) { - errorMsg = e.what(); } if (!errorMsg.empty()) { @@ -406,20 +511,27 @@ int main(int argc, char** argv) { } } -#define GENERIC_BENCHMARK(NAME, READER_TYPE) \ - do { \ - auto* bm = ::benchmark::RegisterBenchmark( \ - NAME, BM_Generic, substraitJsonFile, splitFiles, dataFiles, sessionConf, READER_TYPE) \ - ->MeasureProcessCPUTime() \ - ->UseRealTime(); \ - if (FLAGS_threads > 0) { \ - bm->Threads(FLAGS_threads); \ - } else { \ - bm->ThreadRange(1, std::thread::hardware_concurrency()); \ - } \ - if (FLAGS_iterations > 0) { \ - bm->Iterations(FLAGS_iterations); \ - } \ + RuntimeFactory runtimeFactory = [=](std::unique_ptr listener) { + return dynamic_cast(Runtime::create(kVeloxRuntimeKind, std::move(listener), sessionConf)); + }; + +#define GENERIC_BENCHMARK(READER_TYPE) \ + do { \ + auto* bm = \ + ::benchmark::RegisterBenchmark( \ + "GenericBenchmark", BM_Generic, substraitJsonFile, splitFiles, dataFiles, runtimeFactory, READER_TYPE) \ + ->MeasureProcessCPUTime() \ + ->UseRealTime(); \ + setUpBenchmark(bm); \ + } while (0) + +#define SHUFFLE_WRITE_BENCHMARK(READER_TYPE) \ + do { \ + auto* bm = \ + ::benchmark::RegisterBenchmark("ShuffleWrite", BM_ShuffleWrite, dataFiles[0], runtimeFactory, READER_TYPE) \ + ->MeasureProcessCPUTime() \ + ->UseRealTime(); \ + setUpBenchmark(bm); \ } while (0) LOG(INFO) << "Using options: "; @@ -432,14 +544,27 @@ int main(int argc, char** argv) { LOG(INFO) << "write_path: " << FLAGS_write_path; if (dataFiles.empty()) { - GENERIC_BENCHMARK("SkipInput", FileReaderType::kNone); + GENERIC_BENCHMARK(FileReaderType::kNone); } else { - GENERIC_BENCHMARK("InputFromBatchVector", FileReaderType::kBuffered); - GENERIC_BENCHMARK("InputFromBatchStream", FileReaderType::kStream); + FileReaderType readerType; + if (FLAGS_scan_mode == "buffered") { + readerType = FileReaderType::kBuffered; + LOG(INFO) << "Using buffered mode for reading parquet data."; + } else { + readerType = FileReaderType::kStream; + LOG(INFO) << "Using stream mode for reading parquet data."; + } + if 
(FLAGS_run_shuffle) { + SHUFFLE_WRITE_BENCHMARK(readerType); + } else { + GENERIC_BENCHMARK(readerType); + } } ::benchmark::RunSpecifiedBenchmarks(); ::benchmark::Shutdown(); + gluten::VeloxBackend::get()->tearDown(); + return 0; } diff --git a/cpp/velox/benchmarks/ParquetWriteBenchmark.cc b/cpp/velox/benchmarks/ParquetWriteBenchmark.cc index 7e9959797390c..45348ed4a63b1 100644 --- a/cpp/velox/benchmarks/ParquetWriteBenchmark.cc +++ b/cpp/velox/benchmarks/ParquetWriteBenchmark.cc @@ -257,7 +257,7 @@ class GoogleBenchmarkVeloxParquetWriteCacheScanBenchmark : public GoogleBenchmar // reuse the ParquetWriteConverter for batches caused system % increase a lot auto fileName = "velox_parquet_write.parquet"; - auto runtime = Runtime::create(kVeloxRuntimeKind); + auto runtime = Runtime::create(kVeloxRuntimeKind, AllocationListener::noop()); auto memoryManager = getDefaultMemoryManager(); auto veloxPool = memoryManager->getAggregateMemoryPool(); @@ -307,7 +307,7 @@ class GoogleBenchmarkVeloxParquetWriteCacheScanBenchmark : public GoogleBenchmar // GoogleBenchmarkArrowParquetWriteCacheScanBenchmark usage // ./parquet_write_benchmark --threads=1 --file /mnt/DP_disk1/int.parquet --output /tmp/parquet-write int main(int argc, char** argv) { - initVeloxBackend(); + gluten::initVeloxBackend(); uint32_t iterations = 1; uint32_t threads = 1; std::string datafile; diff --git a/cpp/velox/benchmarks/ShuffleSplitBenchmark.cc b/cpp/velox/benchmarks/ShuffleSplitBenchmark.cc deleted file mode 100644 index 4a4bb69b8d785..0000000000000 --- a/cpp/velox/benchmarks/ShuffleSplitBenchmark.cc +++ /dev/null @@ -1,394 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "benchmarks/common/BenchmarkUtils.h" -#include "memory/ColumnarBatch.h" -#include "shuffle/LocalPartitionWriter.h" -#include "shuffle/VeloxHashBasedShuffleWriter.h" -#include "shuffle/VeloxShuffleWriter.h" -#include "utils/TestUtils.h" -#include "utils/VeloxArrowUtils.h" -#include "utils/macros.h" - -void printTrace(void) { - char** strings; - size_t i, size; - enum Constexpr { kMaxSize = 1024 }; - void* array[kMaxSize]; - size = backtrace(array, kMaxSize); - strings = backtrace_symbols(array, size); - for (i = 0; i < size; i++) - printf(" %s\n", strings[i]); - puts(""); - free(strings); -} - -using arrow::RecordBatchReader; -using arrow::Status; - -using gluten::GlutenException; -using gluten::ShuffleWriterOptions; -using gluten::VeloxShuffleWriter; - -DEFINE_int32(partitions, -1, "Shuffle partitions"); -DEFINE_string(file, "", "Input file to split"); - -namespace gluten { - -const uint16_t kBatchBufferSize = 4096; -const uint16_t kPartitionBufferSize = 4096; - -class BenchmarkShuffleSplit { - public: - BenchmarkShuffleSplit(std::string fileName) { - getRecordBatchReader(fileName); - } - - void getRecordBatchReader(const std::string& inputFile) { - std::unique_ptr<::parquet::arrow::FileReader> parquetReader; - std::shared_ptr recordBatchReader; - - std::shared_ptr fs; - std::string fileName; - GLUTEN_ASSIGN_OR_THROW(fs, arrow::fs::FileSystemFromUriOrPath(inputFile, &fileName)) - - GLUTEN_ASSIGN_OR_THROW(file_, fs->OpenInputFile(fileName)); - - properties_.set_batch_size(kBatchBufferSize); - properties_.set_pre_buffer(false); - properties_.set_use_threads(false); - - GLUTEN_THROW_NOT_OK(::parquet::arrow::FileReader::Make( - arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file_), properties_, &parquetReader)); - - GLUTEN_THROW_NOT_OK(parquetReader->GetSchema(&schema_)); - - auto numRowgroups = parquetReader->num_row_groups(); - - for (int i = 0; i < numRowgroups; ++i) { - rowGroupIndices_.push_back(i); - } - - auto numColumns = schema_->num_fields(); - for (int i = 0; i < numColumns; ++i) { - columnIndices_.push_back(i); - } - } - - void operator()(benchmark::State& state) { - if (FLAGS_cpu != -1) { - setCpu(FLAGS_cpu + state.thread_index()); - } else { - setCpu(state.thread_index()); - } - - auto options = ShuffleWriterOptions{}; - options.bufferSize = kPartitionBufferSize; - options.partitioning = Partitioning::kRoundRobin; - std::string dataFile; - std::vector localDirs; - bool isFromEnv; - GLUTEN_THROW_NOT_OK(setLocalDirsAndDataFileFromEnv(dataFile, localDirs, isFromEnv)); - - std::shared_ptr shuffleWriter; - int64_t elapseRead = 0; - int64_t numBatches = 0; - int64_t numRows = 0; - int64_t splitTime = 0; - auto startTime = std::chrono::steady_clock::now(); - - doSplit( - shuffleWriter, - elapseRead, - numBatches, - numRows, - splitTime, - FLAGS_partitions, - std::move(options), - dataFile, - localDirs, - state); - auto endTime = std::chrono::steady_clock::now(); - auto totalTime = (endTime - startTime).count(); - - cleanupShuffleOutput(dataFile, localDirs, isFromEnv); - - state.SetBytesProcessed(int64_t(shuffleWriter->rawPartitionBytes())); - - state.counters["rowgroups"] = - benchmark::Counter(rowGroupIndices_.size(), benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["columns"] = - benchmark::Counter(columnIndices_.size(), benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - 
state.counters["batches"] = - benchmark::Counter(numBatches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["num_rows"] = - benchmark::Counter(numRows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["num_partitions"] = - benchmark::Counter(FLAGS_partitions, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["batch_buffer_size"] = - benchmark::Counter(kBatchBufferSize, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1024); - state.counters["split_buffer_size"] = - benchmark::Counter(kPartitionBufferSize, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1024); - - state.counters["bytes_spilled"] = benchmark::Counter( - shuffleWriter->totalBytesEvicted(), benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1024); - state.counters["bytes_written"] = benchmark::Counter( - shuffleWriter->totalBytesWritten(), benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1024); - state.counters["bytes_raw"] = benchmark::Counter( - shuffleWriter->rawPartitionBytes(), benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1024); - - state.counters["parquet_parse"] = - benchmark::Counter(elapseRead, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["write_time"] = benchmark::Counter( - shuffleWriter->totalWriteTime(), benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["spill_time"] = benchmark::Counter( - shuffleWriter->totalEvictTime(), benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["compress_time"] = benchmark::Counter( - shuffleWriter->totalCompressTime(), benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - - splitTime = splitTime - shuffleWriter->totalEvictTime() - shuffleWriter->totalCompressTime() - - shuffleWriter->totalWriteTime(); - - state.counters["split_time"] = - benchmark::Counter(splitTime, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - - state.counters["total_time"] = - benchmark::Counter(totalTime, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - shuffleWriter.reset(); - } - - protected: - long setCpu(uint32_t cpuindex) { - cpu_set_t cs; - CPU_ZERO(&cs); - CPU_SET(cpuindex, &cs); - return sched_setaffinity(0, sizeof(cs), &cs); - } - - virtual void doSplit( - std::shared_ptr& shuffleWriter, - int64_t& elapseRead, - int64_t& numBatches, - int64_t& numRows, - int64_t& splitTime, - const int numPartitions, - ShuffleWriterOptions options, - const std::string& dataFile, - const std::vector& localDirs, - benchmark::State& state) {} - - protected: - std::shared_ptr file_; - std::vector rowGroupIndices_; - std::vector columnIndices_; - std::shared_ptr schema_; - parquet::ArrowReaderProperties properties_; -}; - -class BenchmarkShuffleSplitCacheScanBenchmark : public BenchmarkShuffleSplit { - public: - BenchmarkShuffleSplitCacheScanBenchmark(std::string filename) : BenchmarkShuffleSplit(filename) {} - - protected: - void doSplit( - std::shared_ptr& shuffleWriter, - int64_t& elapseRead, - int64_t& numBatches, - int64_t& numRows, - int64_t& splitTime, - const int numPartitions, - ShuffleWriterOptions options, - const std::string& dataFile, - const std::vector& localDirs, - benchmark::State& state) { - std::vector localColumnIndices; - // local_column_indices.push_back(0); - /* local_column_indices.push_back(0); - local_column_indices.push_back(1); - 
local_column_indices.push_back(2); - local_column_indices.push_back(4); - local_column_indices.push_back(5); - local_column_indices.push_back(6); - local_column_indices.push_back(7); -*/ - localColumnIndices.push_back(8); - localColumnIndices.push_back(9); - localColumnIndices.push_back(13); - localColumnIndices.push_back(14); - localColumnIndices.push_back(15); - - std::shared_ptr localSchema; - arrow::FieldVector fields; - fields.push_back(schema_->field(8)); - fields.push_back(schema_->field(9)); - fields.push_back(schema_->field(13)); - fields.push_back(schema_->field(14)); - fields.push_back(schema_->field(15)); - localSchema = std::make_shared(fields); - - if (state.thread_index() == 0) - LOG(INFO) << localSchema->ToString(); - - auto partitionWriter = std::make_unique( - numPartitions, PartitionWriterOptions{}, defaultArrowMemoryPool().get(), dataFile, localDirs); - GLUTEN_ASSIGN_OR_THROW( - shuffleWriter, - VeloxHashBasedShuffleWriter::create( - numPartitions, - std::move(partitionWriter), - std::move(options), - defaultLeafVeloxMemoryPool(), - defaultArrowMemoryPool().get())); - - std::shared_ptr recordBatch; - - std::unique_ptr<::parquet::arrow::FileReader> parquetReader; - std::shared_ptr recordBatchReader; - GLUTEN_THROW_NOT_OK(::parquet::arrow::FileReader::Make( - defaultArrowMemoryPool().get(), ::parquet::ParquetFileReader::Open(file_), properties_, &parquetReader)); - - std::vector> batches; - GLUTEN_THROW_NOT_OK(parquetReader->GetRecordBatchReader(rowGroupIndices_, localColumnIndices, &recordBatchReader)); - do { - TIME_NANO_OR_THROW(elapseRead, recordBatchReader->ReadNext(&recordBatch)); - - if (recordBatch) { - batches.push_back(recordBatch); - numBatches += 1; - numRows += recordBatch->num_rows(); - } - } while (recordBatch); - LOG(INFO) << "parquet parse done elapsed time " << elapseRead / 1000000 << " ms "; - LOG(INFO) << "batches = " << numBatches << " rows = " << numRows; - - for (auto _ : state) { - for_each( - batches.cbegin(), - batches.cend(), - [&shuffleWriter, &splitTime](const std::shared_ptr& recordBatch) { - std::shared_ptr cb; - ARROW_ASSIGN_OR_THROW(cb, recordBatch2VeloxColumnarBatch(*recordBatch)); - TIME_NANO_OR_THROW(splitTime, shuffleWriter->write(cb, ShuffleWriter::kMinMemLimit)); - }); - // LOG(INFO) << " split done memory allocated = " << - // options.memoryPool->bytes_allocated(); - } - - TIME_NANO_OR_THROW(splitTime, shuffleWriter->stop()); - } -}; - -class BenchmarkShuffleSplitIterateScanBenchmark : public BenchmarkShuffleSplit { - public: - BenchmarkShuffleSplitIterateScanBenchmark(std::string filename) : BenchmarkShuffleSplit(filename) {} - - protected: - void doSplit( - std::shared_ptr& shuffleWriter, - int64_t& elapseRead, - int64_t& numBatches, - int64_t& numRows, - int64_t& splitTime, - const int numPartitions, - ShuffleWriterOptions options, - const std::string& dataFile, - const std::vector& localDirs, - benchmark::State& state) { - if (state.thread_index() == 0) - LOG(INFO) << schema_->ToString(); - - auto partitionWriter = std::make_unique( - numPartitions, PartitionWriterOptions{}, defaultArrowMemoryPool().get(), dataFile, localDirs); - GLUTEN_ASSIGN_OR_THROW( - shuffleWriter, - VeloxHashBasedShuffleWriter::create( - numPartitions, - std::move(partitionWriter), - std::move(options), - defaultLeafVeloxMemoryPool(), - defaultArrowMemoryPool().get())); - - std::shared_ptr recordBatch; - - std::unique_ptr<::parquet::arrow::FileReader> parquetReader; - std::shared_ptr recordBatchReader; - 
GLUTEN_THROW_NOT_OK(::parquet::arrow::FileReader::Make( - defaultArrowMemoryPool().get(), ::parquet::ParquetFileReader::Open(file_), properties_, &parquetReader)); - - for (auto _ : state) { - std::vector> batches; - GLUTEN_THROW_NOT_OK(parquetReader->GetRecordBatchReader(rowGroupIndices_, columnIndices_, &recordBatchReader)); - TIME_NANO_OR_THROW(elapseRead, recordBatchReader->ReadNext(&recordBatch)); - while (recordBatch) { - numBatches += 1; - numRows += recordBatch->num_rows(); - std::shared_ptr cb; - ARROW_ASSIGN_OR_THROW(cb, recordBatch2VeloxColumnarBatch(*recordBatch)); - TIME_NANO_OR_THROW(splitTime, shuffleWriter->write(cb, ShuffleWriter::kMinMemLimit)); - TIME_NANO_OR_THROW(elapseRead, recordBatchReader->ReadNext(&recordBatch)); - } - } - TIME_NANO_OR_THROW(splitTime, shuffleWriter->stop()); - } -}; - -} // namespace gluten - -int main(int argc, char** argv) { - benchmark::Initialize(&argc, argv); - gflags::ParseCommandLineFlags(&argc, &argv, true); - - if (FLAGS_file.size() == 0) { - LOG(WARNING) << "No input data file. Please specify via argument --file"; - } - - if (FLAGS_partitions == -1) { - FLAGS_partitions = std::thread::hardware_concurrency(); - } - - gluten::BenchmarkShuffleSplitIterateScanBenchmark iterateScanBenchmark(FLAGS_file); - - auto bm = benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", iterateScanBenchmark) - ->ReportAggregatesOnly(false) - ->MeasureProcessCPUTime() - ->Unit(benchmark::kSecond); - - if (FLAGS_threads > 0) { - bm->Threads(FLAGS_threads); - } else { - bm->ThreadRange(1, std::thread::hardware_concurrency()); - } - if (FLAGS_iterations > 0) { - bm->Iterations(FLAGS_iterations); - } - - benchmark::RunSpecifiedBenchmarks(); - benchmark::Shutdown(); -} diff --git a/cpp/velox/benchmarks/common/BenchmarkUtils.cc b/cpp/velox/benchmarks/common/BenchmarkUtils.cc index a9f6f0838cfa7..c3baa2f339151 100644 --- a/cpp/velox/benchmarks/common/BenchmarkUtils.cc +++ b/cpp/velox/benchmarks/common/BenchmarkUtils.cc @@ -31,14 +31,18 @@ DEFINE_int32(cpu, -1, "Run benchmark on specific CPU"); DEFINE_int32(threads, 1, "The number of threads to run this benchmark"); DEFINE_int32(iterations, 1, "The number of iterations to run this benchmark"); +namespace gluten { namespace { +std::unordered_map bmConfMap = defaultConf(); +} -std::unordered_map bmConfMap = {{gluten::kSparkBatchSize, std::to_string(FLAGS_batch_size)}}; - -} // namespace +std::unordered_map defaultConf() { + return { + {gluten::kSparkBatchSize, std::to_string(FLAGS_batch_size)}, + }; +} void initVeloxBackend(std::unordered_map& conf) { - conf[gluten::kGlogSeverityLevel] = "0"; gluten::VeloxBackend::create(conf); } @@ -190,9 +194,18 @@ void BenchmarkAllocationListener::allocationChanged(int64_t diff) { velox::succinctBytes(diff), velox::succinctBytes(usedBytes_)); auto neededBytes = usedBytes_ + diff - limit_; - auto spilledBytes = iterator_->spillFixedSize(neededBytes); + int64_t spilledBytes = 0; + if (iterator_) { + spilledBytes += iterator_->spillFixedSize(neededBytes); + } + if (spilledBytes < neededBytes && shuffleWriter_) { + int64_t reclaimed = 0; + GLUTEN_THROW_NOT_OK(shuffleWriter_->reclaimFixedSize(neededBytes - spilledBytes, &reclaimed)); + spilledBytes += reclaimed; + } LOG(INFO) << fmt::format("spill finish, got {}.", velox::succinctBytes(spilledBytes)); } else { usedBytes_ += diff; } } +} // namespace gluten diff --git a/cpp/velox/benchmarks/common/BenchmarkUtils.h b/cpp/velox/benchmarks/common/BenchmarkUtils.h index ff5e675f74ce1..181e56807bcd5 100644 --- 
a/cpp/velox/benchmarks/common/BenchmarkUtils.h +++ b/cpp/velox/benchmarks/common/BenchmarkUtils.h @@ -32,6 +32,7 @@ #include "memory/VeloxColumnarBatch.h" #include "memory/VeloxMemoryManager.h" #include "shuffle/Options.h" +#include "shuffle/ShuffleWriter.h" #include "utils/VeloxArrowUtils.h" #include "utils/exception.h" #include "velox/common/memory/Memory.h" @@ -41,6 +42,10 @@ DECLARE_int32(cpu); DECLARE_int32(threads); DECLARE_int32(iterations); +namespace gluten { + +std::unordered_map defaultConf(); + /// Initialize the Velox backend with default value. void initVeloxBackend(); @@ -111,10 +116,16 @@ class BenchmarkAllocationListener final : public gluten::AllocationListener { iterator_ = iterator; } + void setShuffleWriter(gluten::ShuffleWriter* shuffleWriter) { + shuffleWriter_ = shuffleWriter; + } + void allocationChanged(int64_t diff) override; private: uint64_t usedBytes_{0L}; uint64_t limit_{0L}; - gluten::ResultIterator* iterator_; + gluten::ResultIterator* iterator_{nullptr}; + gluten::ShuffleWriter* shuffleWriter_{nullptr}; }; +} // namespace gluten diff --git a/cpp/velox/benchmarks/common/FileReaderIterator.h b/cpp/velox/benchmarks/common/FileReaderIterator.h index 3fa94b6afba52..16db58ce45693 100644 --- a/cpp/velox/benchmarks/common/FileReaderIterator.h +++ b/cpp/velox/benchmarks/common/FileReaderIterator.h @@ -38,8 +38,6 @@ class FileReaderIterator : public ColumnarBatchIterator { virtual ~FileReaderIterator() = default; - virtual void createReader() = 0; - virtual std::shared_ptr getSchema() = 0; int64_t getCollectBatchTime() const { diff --git a/cpp/velox/benchmarks/common/ParquetReaderIterator.h b/cpp/velox/benchmarks/common/ParquetReaderIterator.h index e654dc1897b28..6d162e4b68d5e 100644 --- a/cpp/velox/benchmarks/common/ParquetReaderIterator.h +++ b/cpp/velox/benchmarks/common/ParquetReaderIterator.h @@ -27,7 +27,7 @@ class ParquetReaderIterator : public FileReaderIterator { public: explicit ParquetReaderIterator(const std::string& path) : FileReaderIterator(path) {} - void createReader() override { + void createReader() { parquet::ArrowReaderProperties properties = parquet::default_arrow_reader_properties(); properties.set_batch_size(FLAGS_batch_size); GLUTEN_THROW_NOT_OK(parquet::arrow::FileReader::Make( diff --git a/cpp/velox/compute/VeloxBackend.cc b/cpp/velox/compute/VeloxBackend.cc index 187c36e1e8bd4..a3658faa3a188 100644 --- a/cpp/velox/compute/VeloxBackend.cc +++ b/cpp/velox/compute/VeloxBackend.cc @@ -55,8 +55,10 @@ using namespace facebook; namespace gluten { namespace { -gluten::Runtime* veloxRuntimeFactory(const std::unordered_map& sessionConf) { - return new gluten::VeloxRuntime(sessionConf); +gluten::Runtime* veloxRuntimeFactory( + std::unique_ptr listener, + const std::unordered_map& sessionConf) { + return new gluten::VeloxRuntime(std::move(listener), sessionConf); } } // namespace @@ -156,8 +158,8 @@ void VeloxBackend::initCache() { cacheFilePrefix_ = getCacheFilePrefix(); std::string ssdCachePath = ssdCachePathPrefix + "/" + cacheFilePrefix_; ssdCacheExecutor_ = std::make_unique(ssdCacheIOThreads); - auto ssd = - std::make_unique(ssdCachePath, ssdCacheSize, ssdCacheShards, ssdCacheExecutor_.get()); + const cache::SsdCache::Config config(ssdCachePath, ssdCacheSize, ssdCacheShards, ssdCacheExecutor_.get()); + auto ssd = std::make_unique(config); std::error_code ec; const std::filesystem::space_info si = std::filesystem::space(ssdCachePathPrefix, ec); @@ -226,6 +228,9 @@ void VeloxBackend::initConnector() { FLAGS_cache_prefetch_min_pct = 
backendConf_->get(kCachePrefetchMinPct, 0); auto ioThreads = backendConf_->get(kVeloxIOThreads, kVeloxIOThreadsDefault); + GLUTEN_CHECK( + ioThreads >= 0, + kVeloxIOThreads + " was set to negative number " + std::to_string(ioThreads) + ", this should not happen."); if (ioThreads > 0) { ioExecutor_ = std::make_unique(ioThreads); } diff --git a/cpp/velox/compute/VeloxPlanConverter.cc b/cpp/velox/compute/VeloxPlanConverter.cc index ed42cb15a51eb..315ff2da67ad0 100644 --- a/cpp/velox/compute/VeloxPlanConverter.cc +++ b/cpp/velox/compute/VeloxPlanConverter.cc @@ -60,6 +60,7 @@ std::shared_ptr parseScanSplitInfo( splitInfo->starts.reserve(fileList.size()); splitInfo->lengths.reserve(fileList.size()); splitInfo->partitionColumns.reserve(fileList.size()); + splitInfo->properties.reserve(fileList.size()); splitInfo->metadataColumns.reserve(fileList.size()); for (const auto& file : fileList) { // Expect all Partitions share the same index. @@ -80,6 +81,13 @@ std::shared_ptr parseScanSplitInfo( splitInfo->paths.emplace_back(file.uri_file()); splitInfo->starts.emplace_back(file.start()); splitInfo->lengths.emplace_back(file.length()); + + facebook::velox::FileProperties fileProps; + if (file.has_properties()) { + fileProps.fileSize = file.properties().filesize(); + fileProps.modificationTime = file.properties().modificationtime(); + } + splitInfo->properties.emplace_back(fileProps); switch (file.file_format_case()) { case SubstraitFileFormatCase::kOrc: splitInfo->format = dwio::common::FileFormat::ORC; diff --git a/cpp/velox/compute/VeloxRuntime.cc b/cpp/velox/compute/VeloxRuntime.cc index 44f04ef31ae72..fde87b5b49bdc 100644 --- a/cpp/velox/compute/VeloxRuntime.cc +++ b/cpp/velox/compute/VeloxRuntime.cc @@ -35,7 +35,9 @@ #include "utils/VeloxArrowUtils.h" #ifdef ENABLE_HDFS + #include "operators/writer/VeloxParquetDatasourceHDFS.h" + #endif #ifdef ENABLE_S3 @@ -54,8 +56,12 @@ using namespace facebook; namespace gluten { -VeloxRuntime::VeloxRuntime(const std::unordered_map& confMap) : Runtime(confMap) { +VeloxRuntime::VeloxRuntime( + std::unique_ptr listener, + const std::unordered_map& confMap) + : Runtime(std::make_shared(std::move(listener)), confMap) { // Refresh session config. + vmm_ = dynamic_cast(memoryManager_.get()); veloxCfg_ = std::make_shared(confMap_); debugModeEnabled_ = veloxCfg_->get(kDebugModeEnabled, false); FLAGS_minloglevel = veloxCfg_->get(kGlogSeverityLevel, FLAGS_minloglevel); @@ -125,15 +131,17 @@ void VeloxRuntime::injectWriteFilesTempPath(const std::string& path) { writeFilesTempPath_ = path; } +VeloxMemoryManager* VeloxRuntime::memoryManager() { + return vmm_; +} + std::shared_ptr VeloxRuntime::createResultIterator( - MemoryManager* memoryManager, const std::string& spillDir, const std::vector>& inputs, const std::unordered_map& sessionConf) { LOG_IF(INFO, debugModeEnabled_) << "VeloxRuntime session config:" << printConfig(confMap_); - VeloxPlanConverter veloxPlanConverter( - inputs, getLeafVeloxPool(memoryManager).get(), sessionConf, writeFilesTempPath_); + VeloxPlanConverter veloxPlanConverter(inputs, vmm_->getLeafMemoryPool().get(), sessionConf, writeFilesTempPath_); veloxPlan_ = veloxPlanConverter.toVeloxPlan(substraitPlan_, std::move(localFiles_)); // Scan node can be required. @@ -144,15 +152,14 @@ std::shared_ptr VeloxRuntime::createResultIterator( // Separate the scan ids and stream ids, and get the scan infos. 
getInfoAndIds(veloxPlanConverter.splitInfos(), veloxPlan_->leafPlanNodeIds(), scanInfos, scanIds, streamIds); - auto* vmm = toVeloxMemoryManager(memoryManager); auto wholestageIter = std::make_unique( - vmm, veloxPlan_, scanIds, scanInfos, streamIds, spillDir, sessionConf, taskInfo_); + vmm_, veloxPlan_, scanIds, scanInfos, streamIds, spillDir, sessionConf, taskInfo_); return std::make_shared(std::move(wholestageIter), this); } -std::shared_ptr VeloxRuntime::createColumnar2RowConverter(MemoryManager* memoryManager) { - auto ctxVeloxPool = getLeafVeloxPool(memoryManager); - return std::make_shared(ctxVeloxPool); +std::shared_ptr VeloxRuntime::createColumnar2RowConverter() { + auto veloxPool = vmm_->getLeafMemoryPool(); + return std::make_shared(veloxPool); } std::shared_ptr VeloxRuntime::createOrGetEmptySchemaBatch(int32_t numRows) { @@ -165,53 +172,45 @@ std::shared_ptr VeloxRuntime::createOrGetEmptySchemaBatch(int32_t } std::shared_ptr VeloxRuntime::select( - MemoryManager* memoryManager, std::shared_ptr batch, std::vector columnIndices) { - auto ctxVeloxPool = getLeafVeloxPool(memoryManager); - auto veloxBatch = gluten::VeloxColumnarBatch::from(ctxVeloxPool.get(), batch); - auto outputBatch = veloxBatch->select(ctxVeloxPool.get(), std::move(columnIndices)); + auto veloxPool = vmm_->getLeafMemoryPool(); + auto veloxBatch = gluten::VeloxColumnarBatch::from(veloxPool.get(), batch); + auto outputBatch = veloxBatch->select(veloxPool.get(), std::move(columnIndices)); return outputBatch; } -std::shared_ptr VeloxRuntime::createRow2ColumnarConverter( - MemoryManager* memoryManager, - struct ArrowSchema* cSchema) { - auto ctxVeloxPool = getLeafVeloxPool(memoryManager); - return std::make_shared(cSchema, ctxVeloxPool); +std::shared_ptr VeloxRuntime::createRow2ColumnarConverter(struct ArrowSchema* cSchema) { + auto veloxPool = vmm_->getLeafMemoryPool(); + return std::make_shared(cSchema, veloxPool); } std::shared_ptr VeloxRuntime::createShuffleWriter( int numPartitions, std::unique_ptr partitionWriter, - ShuffleWriterOptions options, - MemoryManager* memoryManager) { - auto ctxPool = getLeafVeloxPool(memoryManager); - auto arrowPool = memoryManager->getArrowMemoryPool(); - std::shared_ptr shuffleWriter; - if (options.shuffleWriterType == kHashShuffle) { - GLUTEN_ASSIGN_OR_THROW( - shuffleWriter, - VeloxHashBasedShuffleWriter::create( - numPartitions, std::move(partitionWriter), std::move(options), ctxPool, arrowPool)); - } else if (options.shuffleWriterType == kSortShuffle) { - GLUTEN_ASSIGN_OR_THROW( - shuffleWriter, - VeloxSortBasedShuffleWriter::create( - numPartitions, std::move(partitionWriter), std::move(options), ctxPool, arrowPool)); - } + ShuffleWriterOptions options) { + auto veloxPool = vmm_->getLeafMemoryPool(); + auto arrowPool = vmm_->getArrowMemoryPool(); + GLUTEN_ASSIGN_OR_THROW( + std::shared_ptr shuffleWriter, + VeloxShuffleWriter::create( + options.shuffleWriterType, + numPartitions, + std::move(partitionWriter), + std::move(options), + veloxPool, + arrowPool)); return shuffleWriter; } std::shared_ptr VeloxRuntime::createDatasource( const std::string& filePath, - MemoryManager* memoryManager, std::shared_ptr schema) { static std::atomic_uint32_t id{0UL}; - auto veloxPool = getAggregateVeloxPool(memoryManager)->addAggregateChild("datasource." + std::to_string(id++)); + auto veloxPool = vmm_->getAggregateMemoryPool()->addAggregateChild("datasource." + std::to_string(id++)); // Pass a dedicate pool for S3 and GCS sinks as can't share veloxPool // with parquet writer. 
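// [Editorial aside: illustrative sketch, not part of this patch] The
// createShuffleWriter() hunk above folds the hash/sort if-else into a single
// VeloxShuffleWriter::create(shuffleWriterType, ...) entry point. The dispatch
// pattern it relies on, with stand-in types (names are assumptions):
#include <memory>
#include <stdexcept>

enum class WriterTypeSketch { kHash, kSort };

struct WriterSketch {
  virtual ~WriterSketch() = default;
};
struct HashWriterSketch final : WriterSketch {};
struct SortWriterSketch final : WriterSketch {};

std::shared_ptr<WriterSketch> createWriterSketch(WriterTypeSketch type) {
  switch (type) {
    case WriterTypeSketch::kHash:
      return std::make_shared<HashWriterSketch>();
    case WriterTypeSketch::kSort:
      return std::make_shared<SortWriterSketch>();
  }
  throw std::runtime_error("unknown shuffle writer type");
}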
- auto sinkPool = getLeafVeloxPool(memoryManager); + auto sinkPool = vmm_->getLeafMemoryPool(); if (isSupportedHDFSPath(filePath)) { #ifdef ENABLE_HDFS return std::make_shared(filePath, veloxPool, sinkPool, schema); @@ -246,12 +245,10 @@ std::shared_ptr VeloxRuntime::createDatasource( std::shared_ptr VeloxRuntime::createShuffleReader( std::shared_ptr schema, - ShuffleReaderOptions options, - arrow::MemoryPool* pool, - MemoryManager* memoryManager) { + ShuffleReaderOptions options) { auto rowType = facebook::velox::asRowType(gluten::fromArrowSchema(schema)); auto codec = gluten::createArrowIpcCodec(options.compressionType, options.codecBackend); - auto ctxVeloxPool = getLeafVeloxPool(memoryManager); + auto ctxVeloxPool = vmm_->getLeafMemoryPool(); auto veloxCompressionType = facebook::velox::common::stringToCompressionKind(options.compressionTypeStr); auto deserializerFactory = std::make_unique( schema, @@ -259,19 +256,17 @@ std::shared_ptr VeloxRuntime::createShuffleReader( veloxCompressionType, rowType, options.batchSize, - pool, + vmm_->getArrowMemoryPool(), ctxVeloxPool, options.shuffleWriterType); auto reader = std::make_shared(std::move(deserializerFactory)); return reader; } -std::unique_ptr VeloxRuntime::createColumnarBatchSerializer( - MemoryManager* memoryManager, - arrow::MemoryPool* arrowPool, - struct ArrowSchema* cSchema) { - auto ctxVeloxPool = getLeafVeloxPool(memoryManager); - return std::make_unique(arrowPool, ctxVeloxPool, cSchema); +std::unique_ptr VeloxRuntime::createColumnarBatchSerializer(struct ArrowSchema* cSchema) { + auto arrowPool = vmm_->getArrowMemoryPool(); + auto veloxPool = vmm_->getLeafMemoryPool(); + return std::make_unique(arrowPool, veloxPool, cSchema); } void VeloxRuntime::dumpConf(const std::string& path) { diff --git a/cpp/velox/compute/VeloxRuntime.h b/cpp/velox/compute/VeloxRuntime.h index 80408bccb2b5a..096ecb6fbf13b 100644 --- a/cpp/velox/compute/VeloxRuntime.h +++ b/cpp/velox/compute/VeloxRuntime.h @@ -33,81 +33,49 @@ inline static const std::string kVeloxRuntimeKind{"velox"}; class VeloxRuntime final : public Runtime { public: - explicit VeloxRuntime(const std::unordered_map& confMap); + explicit VeloxRuntime( + std::unique_ptr listener, + const std::unordered_map& confMap); void parsePlan(const uint8_t* data, int32_t size, std::optional dumpFile) override; void parseSplitInfo(const uint8_t* data, int32_t size, std::optional dumpFile) override; - static std::shared_ptr getAggregateVeloxPool(MemoryManager* memoryManager) { - return toVeloxMemoryManager(memoryManager)->getAggregateMemoryPool(); - } - - static std::shared_ptr getLeafVeloxPool(MemoryManager* memoryManager) { - return toVeloxMemoryManager(memoryManager)->getLeafMemoryPool(); - } - - static VeloxMemoryManager* toVeloxMemoryManager(MemoryManager* memoryManager) { - if (auto veloxMemoryManager = dynamic_cast(memoryManager)) { - return veloxMemoryManager; - } else { - GLUTEN_CHECK(false, "Velox memory manager should be used for Velox runtime."); - } - } - - MemoryManager* createMemoryManager( - const std::string& name, - std::shared_ptr allocator, - std::unique_ptr listener) override { - return new VeloxMemoryManager(name, allocator, std::move(listener)); - } + VeloxMemoryManager* memoryManager() override; // FIXME This is not thread-safe? 
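// [Editorial aside: illustrative sketch, not part of this patch] Across these
// VeloxRuntime hunks the MemoryManager* parameter disappears from every
// factory method; the runtime now resolves pools through the vmm_ member set
// in its constructor. Reduced to a toy shape (all names are stand-ins):
#include <memory>

struct PoolSketch {};
struct MemoryManagerSketch {
  std::shared_ptr<PoolSketch> leaf = std::make_shared<PoolSketch>();
  std::shared_ptr<PoolSketch> leafPool() const { return leaf; }
};

// Before: callers threaded the memory manager through every call.
std::shared_ptr<PoolSketch> leafPoolBefore(MemoryManagerSketch* mm) {
  return mm->leafPool();
}

// After: the runtime-like object owns the association once.
class RuntimeShapeSketch {
 public:
  explicit RuntimeShapeSketch(MemoryManagerSketch* vmm) : vmm_(vmm) {}
  std::shared_ptr<PoolSketch> leafPool() const { return vmm_->leafPool(); }

 private:
  MemoryManagerSketch* vmm_;
};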
std::shared_ptr createResultIterator( - MemoryManager* memoryManager, const std::string& spillDir, const std::vector>& inputs = {}, const std::unordered_map& sessionConf = {}) override; - std::shared_ptr createColumnar2RowConverter(MemoryManager* memoryManager) override; + std::shared_ptr createColumnar2RowConverter() override; std::shared_ptr createOrGetEmptySchemaBatch(int32_t numRows) override; - std::shared_ptr select( - MemoryManager* memoryManager, - std::shared_ptr batch, - std::vector columnIndices) override; + std::shared_ptr select(std::shared_ptr batch, std::vector columnIndices) + override; - std::shared_ptr createRow2ColumnarConverter( - MemoryManager* memoryManager, - struct ArrowSchema* cSchema) override; + std::shared_ptr createRow2ColumnarConverter(struct ArrowSchema* cSchema) override; std::shared_ptr createShuffleWriter( int numPartitions, std::unique_ptr partitionWriter, - ShuffleWriterOptions options, - MemoryManager* memoryManager) override; + ShuffleWriterOptions options) override; Metrics* getMetrics(ColumnarBatchIterator* rawIter, int64_t exportNanos) override { auto iter = static_cast(rawIter); return iter->getMetrics(exportNanos); } - std::shared_ptr createDatasource( - const std::string& filePath, - MemoryManager* memoryManager, - std::shared_ptr schema) override; + std::shared_ptr createDatasource(const std::string& filePath, std::shared_ptr schema) + override; std::shared_ptr createShuffleReader( std::shared_ptr schema, - ShuffleReaderOptions options, - arrow::MemoryPool* pool, - MemoryManager* memoryManager) override; + ShuffleReaderOptions options) override; - std::unique_ptr createColumnarBatchSerializer( - MemoryManager* memoryManager, - arrow::MemoryPool* arrowPool, - struct ArrowSchema* cSchema) override; + std::unique_ptr createColumnarBatchSerializer(struct ArrowSchema* cSchema) override; std::string planString(bool details, const std::unordered_map& sessionConf) override; @@ -131,6 +99,7 @@ class VeloxRuntime final : public Runtime { std::vector& streamIds); private: + VeloxMemoryManager* vmm_; std::shared_ptr veloxPlan_; std::shared_ptr veloxCfg_; bool debugModeEnabled_{false}; diff --git a/cpp/velox/compute/WholeStageResultIterator.cc b/cpp/velox/compute/WholeStageResultIterator.cc index f719c119c3e09..296b9415b1594 100644 --- a/cpp/velox/compute/WholeStageResultIterator.cc +++ b/cpp/velox/compute/WholeStageResultIterator.cc @@ -83,8 +83,13 @@ WholeStageResultIterator::WholeStageResultIterator( std::unordered_set emptySet; velox::core::PlanFragment planFragment{planNode, velox::core::ExecutionStrategy::kUngrouped, 1, emptySet}; std::shared_ptr queryCtx = createNewVeloxQueryCtx(); + static std::atomic vtId{0}; // Velox task ID to distinguish from Spark task ID. 
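// [Editorial aside: illustrative sketch, not part of this patch] The VTID
// suffix added below comes from a static atomic counter, so two Velox tasks
// created for the same Spark stage/task id no longer collide on task name.
// The naming pattern in isolation:
#include <atomic>
#include <cstdint>
#include <string>

std::string makeTaskIdSketch(int32_t stageId, int32_t taskId) {
  static std::atomic<int32_t> vtId{0};
  return "Gluten_Stage_" + std::to_string(stageId) + "_TID_" + std::to_string(taskId) +
      "_VTID_" + std::to_string(vtId++);
}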
task_ = velox::exec::Task::create( - fmt::format("Gluten_Stage_{}_TID_{}", std::to_string(taskInfo_.stageId), std::to_string(taskInfo_.taskId)), + fmt::format( + "Gluten_Stage_{}_TID_{}_VTID_{}", + std::to_string(taskInfo_.stageId), + std::to_string(taskInfo_.taskId), + std::to_string(vtId++)), std::move(planFragment), 0, std::move(queryCtx), @@ -109,6 +114,7 @@ WholeStageResultIterator::WholeStageResultIterator( const auto& paths = scanInfo->paths; const auto& starts = scanInfo->starts; const auto& lengths = scanInfo->lengths; + const auto& properties = scanInfo->properties; const auto& format = scanInfo->format; const auto& partitionColumns = scanInfo->partitionColumns; const auto& metadataColumns = scanInfo->metadataColumns; @@ -135,7 +141,9 @@ WholeStageResultIterator::WholeStageResultIterator( std::nullopt, customSplitInfo, nullptr, - deleteFiles); + deleteFiles, + std::unordered_map(), + properties[idx]); } else { split = std::make_shared( kHiveConnectorId, @@ -149,7 +157,8 @@ WholeStageResultIterator::WholeStageResultIterator( nullptr, std::unordered_map(), 0, - metadataColumn); + metadataColumn, + properties[idx]); } connectorSplits.emplace_back(split); } @@ -445,8 +454,6 @@ std::unordered_map WholeStageResultIterator::getQueryC } // Adjust timestamp according to the above configured session timezone. configs[velox::core::QueryConfig::kAdjustTimestampToTimezone] = "true"; - // Align Velox size function with Spark. - configs[velox::core::QueryConfig::kSparkLegacySizeOfNull] = std::to_string(veloxCfg_->get(kLegacySize, true)); { // partial aggregation memory config @@ -525,6 +532,7 @@ std::shared_ptr WholeStageResultIterator::createConnectorConfig() !veloxCfg_->get(kCaseSensitive, false) ? "true" : "false"; configs[velox::connector::hive::HiveConfig::kPartitionPathAsLowerCaseSession] = "false"; configs[velox::connector::hive::HiveConfig::kParquetWriteTimestampUnitSession] = "6"; + configs[velox::connector::hive::HiveConfig::kReadTimestampUnitSession] = "6"; configs[velox::connector::hive::HiveConfig::kMaxPartitionsPerWritersSession] = std::to_string(veloxCfg_->get(kMaxPartitions, 10000)); configs[velox::connector::hive::HiveConfig::kIgnoreMissingFilesSession] = diff --git a/cpp/velox/config/VeloxConfig.h b/cpp/velox/config/VeloxConfig.h index f57f1293e22e1..7a96f03f4985e 100644 --- a/cpp/velox/config/VeloxConfig.h +++ b/cpp/velox/config/VeloxConfig.h @@ -99,7 +99,7 @@ const std::string kVeloxAsyncTimeoutOnTaskStopping = const int32_t kVeloxAsyncTimeoutOnTaskStoppingDefault = 30000; // 30s // udf -const std::string kVeloxUdfLibraryPaths = "spark.gluten.sql.columnar.backend.velox.udfLibraryPaths"; +const std::string kVeloxUdfLibraryPaths = "spark.gluten.sql.columnar.backend.velox.internal.udfLibraryPaths"; // backtrace allocation const std::string kBacktraceAllocation = "spark.gluten.backtrace.allocation"; diff --git a/cpp/velox/jni/VeloxJniWrapper.cc b/cpp/velox/jni/VeloxJniWrapper.cc index 9da7355d1b3a9..f11edf1a435d7 100644 --- a/cpp/velox/jni/VeloxJniWrapper.cc +++ b/cpp/velox/jni/VeloxJniWrapper.cc @@ -30,6 +30,8 @@ #include "jni/JniFileSystem.h" #include "memory/VeloxMemoryManager.h" #include "substrait/SubstraitToVeloxPlanValidator.h" +#include "utils/ObjectStore.h" +#include "utils/VeloxBatchAppender.h" #include "velox/common/base/BloomFilter.h" #include @@ -158,8 +160,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_utils_VeloxBloomFilterJniWrapper_ auto filter = std::make_shared>>(); filter->reset(capacity); GLUTEN_CHECK(filter->isSet(), "Bloom-filter is not initialized"); 
- return ctx->objectStore()->save(filter); - JNI_METHOD_END(gluten::kInvalidResourceHandle) + return ctx->saveObject(filter); + JNI_METHOD_END(gluten::kInvalidObjectHandle) } JNIEXPORT jlong JNICALL Java_org_apache_gluten_utils_VeloxBloomFilterJniWrapper_init( // NOLINT @@ -172,8 +174,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_utils_VeloxBloomFilterJniWrapper_ auto filter = std::make_shared>>(); uint8_t* serialized = safeArray.elems(); filter->merge(reinterpret_cast(serialized)); - return ctx->objectStore()->save(filter); - JNI_METHOD_END(gluten::kInvalidResourceHandle) + return ctx->saveObject(filter); + JNI_METHOD_END(gluten::kInvalidObjectHandle) } JNIEXPORT void JNICALL Java_org_apache_gluten_utils_VeloxBloomFilterJniWrapper_insertLong( // NOLINT @@ -183,7 +185,7 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_utils_VeloxBloomFilterJniWrapper_i jlong item) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto filter = ctx->objectStore()->retrieve>>(handle); + auto filter = gluten::ObjectStore::retrieve>>(handle); GLUTEN_CHECK(filter->isSet(), "Bloom-filter is not initialized"); filter->insert(folly::hasher()(item)); JNI_METHOD_END() @@ -196,7 +198,7 @@ JNIEXPORT jboolean JNICALL Java_org_apache_gluten_utils_VeloxBloomFilterJniWrapp jlong item) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto filter = ctx->objectStore()->retrieve>>(handle); + auto filter = gluten::ObjectStore::retrieve>>(handle); GLUTEN_CHECK(filter->isSet(), "Bloom-filter is not initialized"); bool out = filter->mayContain(folly::hasher()(item)); return out; @@ -221,8 +223,8 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_utils_VeloxBloomFilterJniWrapper_m jlong other) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto to = ctx->objectStore()->retrieve>>(handle); - auto from = ctx->objectStore()->retrieve>>(other); + auto to = gluten::ObjectStore::retrieve>>(handle); + auto from = gluten::ObjectStore::retrieve>>(other); GLUTEN_CHECK(to->isSet(), "Bloom-filter is not initialized"); GLUTEN_CHECK(from->isSet(), "Bloom-filter is not initialized"); std::vector serialized = serialize(from.get()); @@ -236,7 +238,7 @@ JNIEXPORT jbyteArray JNICALL Java_org_apache_gluten_utils_VeloxBloomFilterJniWra jlong handle) { JNI_METHOD_START auto ctx = gluten::getRuntime(env, wrapper); - auto filter = ctx->objectStore()->retrieve>>(handle); + auto filter = gluten::ObjectStore::retrieve>>(handle); GLUTEN_CHECK(filter->isSet(), "Bloom-filter is not initialized"); std::vector buffer = serialize(filter.get()); auto size = buffer.capacity(); @@ -246,6 +248,21 @@ JNIEXPORT jbyteArray JNICALL Java_org_apache_gluten_utils_VeloxBloomFilterJniWra JNI_METHOD_END(nullptr) } +JNIEXPORT jlong JNICALL Java_org_apache_gluten_utils_VeloxBatchAppenderJniWrapper_create( // NOLINT + JNIEnv* env, + jobject wrapper, + jint minOutputBatchSize, + jobject jIter) { + JNI_METHOD_START + auto ctx = gluten::getRuntime(env, wrapper); + auto pool = dynamic_cast(ctx->memoryManager())->getLeafMemoryPool(); + auto iter = gluten::makeJniColumnarBatchIterator(env, jIter, ctx, nullptr); + auto appender = std::make_shared( + std::make_unique(pool.get(), minOutputBatchSize, std::move(iter))); + return ctx->saveObject(appender); + JNI_METHOD_END(gluten::kInvalidObjectHandle) +} + #ifdef __cplusplus } #endif diff --git a/cpp/velox/memory/VeloxColumnarBatch.cc b/cpp/velox/memory/VeloxColumnarBatch.cc index 83428707b3208..0d8db312721a9 100644 --- a/cpp/velox/memory/VeloxColumnarBatch.cc +++ 
b/cpp/velox/memory/VeloxColumnarBatch.cc @@ -143,13 +143,13 @@ std::shared_ptr VeloxColumnarBatch::select( return std::make_shared(rowVector); } -std::pair VeloxColumnarBatch::getRowBytes(int32_t rowId) const { +std::vector VeloxColumnarBatch::toUnsafeRow(int32_t rowId) const { auto fast = std::make_unique(rowVector_); auto size = fast->rowSize(rowId); - char* rowBytes = new char[size]; - std::memset(rowBytes, 0, size); - fast->serialize(0, rowBytes); - return std::make_pair(rowBytes, size); + std::vector bytes(size); + std::memset(bytes.data(), 0, bytes.size()); + fast->serialize(0, bytes.data()); + return bytes; } } // namespace gluten diff --git a/cpp/velox/memory/VeloxColumnarBatch.h b/cpp/velox/memory/VeloxColumnarBatch.h index c319b7977c33a..6c79f2772d2dd 100644 --- a/cpp/velox/memory/VeloxColumnarBatch.h +++ b/cpp/velox/memory/VeloxColumnarBatch.h @@ -41,7 +41,7 @@ class VeloxColumnarBatch final : public ColumnarBatch { std::shared_ptr exportArrowSchema() override; std::shared_ptr exportArrowArray() override; - std::pair getRowBytes(int32_t rowId) const override; + std::vector toUnsafeRow(int32_t rowId) const override; std::shared_ptr select(facebook::velox::memory::MemoryPool* pool, std::vector columnIndices); facebook::velox::RowVectorPtr getRowVector() const; facebook::velox::RowVectorPtr getFlattenedRowVector(); diff --git a/cpp/velox/memory/VeloxMemoryManager.cc b/cpp/velox/memory/VeloxMemoryManager.cc index b7bd3a9f9a58d..442090004a417 100644 --- a/cpp/velox/memory/VeloxMemoryManager.cc +++ b/cpp/velox/memory/VeloxMemoryManager.cc @@ -16,6 +16,10 @@ */ #include "VeloxMemoryManager.h" +#ifdef ENABLE_JEMALLOC +#include +#endif + #include "velox/common/memory/MallocAllocator.h" #include "velox/common/memory/MemoryPool.h" #include "velox/exec/MemoryReclaimer.h" @@ -44,7 +48,7 @@ class ListenableArbitrator : public velox::memory::MemoryArbitrator { uint64_t growCapacity(velox::memory::MemoryPool* pool, uint64_t targetBytes) override { std::lock_guard l(mutex_); listener_->allocationChanged(targetBytes); - if (!pool->grow(targetBytes, 0)) { + if (!growPool(pool, targetBytes, 0)) { VELOX_FAIL("Failed to grow root pool's capacity for {}", velox::succinctBytes(targetBytes)); } return targetBytes; @@ -74,14 +78,14 @@ class ListenableArbitrator : public velox::memory::MemoryArbitrator { uint64_t targetBytes, bool allowSpill, bool allowAbort) override { - velox::memory::ScopedMemoryArbitrationContext ctx(nullptr); + velox::memory::ScopedMemoryArbitrationContext ctx((const velox::memory::MemoryPool*)nullptr); facebook::velox::exec::MemoryReclaimer::Stats status; VELOX_CHECK_EQ(pools.size(), 1, "Gluten only has one root pool"); std::lock_guard l(mutex_); // FIXME: Do we have recursive locking for this mutex? auto pool = pools.at(0); const uint64_t oldCapacity = pool->capacity(); pool->reclaim(targetBytes, 0, status); // ignore the output - pool->shrink(0); + shrinkPool(pool.get(), 0); const uint64_t newCapacity = pool->capacity(); uint64_t total = oldCapacity - newCapacity; listener_->allocationChanged(-total); @@ -104,14 +108,14 @@ class ListenableArbitrator : public velox::memory::MemoryArbitrator { // We should pass bytes as parameter "reservationBytes" when calling ::grow. 
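// [Editorial aside: illustrative sketch, not part of this patch] The
// getRowBytes() -> toUnsafeRow() change in the VeloxColumnarBatch hunk above
// swaps a raw `new char[size]` plus length pair (which the caller had to
// delete[]) for a std::vector<char> return value that owns its buffer. The
// shape of that change, in isolation:
#include <cstddef>
#include <cstring>
#include <vector>

std::vector<char> toUnsafeRowSketch(const char* serialized, size_t size) {
  std::vector<char> bytes(size);             // value-initialized, RAII-owned buffer
  std::memcpy(bytes.data(), serialized, size);
  return bytes;                              // moved out; no manual delete[] needed
}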
auto freeByes = pool->freeBytes(); if (freeByes > bytes) { - if (pool->grow(0, bytes)) { + if (growPool(pool, 0, bytes)) { return; } } - auto reclaimedFreeBytes = pool->shrink(0); + auto reclaimedFreeBytes = shrinkPool(pool, 0); auto neededBytes = velox::bits::roundUp(bytes - reclaimedFreeBytes, memoryPoolTransferCapacity_); listener_->allocationChanged(neededBytes); - auto ret = pool->grow(reclaimedFreeBytes + neededBytes, bytes); + auto ret = growPool(pool, reclaimedFreeBytes + neededBytes, bytes); VELOX_CHECK( ret, "{} failed to grow {} bytes, current state {}", @@ -121,7 +125,7 @@ class ListenableArbitrator : public velox::memory::MemoryArbitrator { } uint64_t shrinkCapacityLocked(velox::memory::MemoryPool* pool, uint64_t bytes) { - uint64_t freeBytes = pool->shrink(bytes); + uint64_t freeBytes = shrinkPool(pool, bytes); listener_->allocationChanged(-freeBytes); return freeBytes; } @@ -157,17 +161,14 @@ class ArbitratorFactoryRegister { gluten::AllocationListener* listener_; }; -VeloxMemoryManager::VeloxMemoryManager( - const std::string& name, - std::shared_ptr allocator, - std::unique_ptr listener) - : MemoryManager(), name_(name), listener_(std::move(listener)) { +VeloxMemoryManager::VeloxMemoryManager(std::unique_ptr listener) + : MemoryManager(), listener_(std::move(listener)) { auto reservationBlockSize = VeloxBackend::get()->getBackendConf()->get( kMemoryReservationBlockSize, kMemoryReservationBlockSizeDefault); auto memInitCapacity = VeloxBackend::get()->getBackendConf()->get(kVeloxMemInitCapacity, kVeloxMemInitCapacityDefault); blockListener_ = std::make_unique(listener_.get(), reservationBlockSize); - listenableAlloc_ = std::make_unique(allocator.get(), blockListener_.get()); + listenableAlloc_ = std::make_unique(defaultMemoryAllocator().get(), blockListener_.get()); arrowPool_ = std::make_unique(listenableAlloc_.get()); ArbitratorFactoryRegister afr(listener_.get()); @@ -185,17 +186,17 @@ VeloxMemoryManager::VeloxMemoryManager( veloxMemoryManager_ = std::make_unique(mmOptions); veloxAggregatePool_ = veloxMemoryManager_->addRootPool( - name_ + "_root", + "root", velox::memory::kMaxMemory, // the 3rd capacity facebook::velox::memory::MemoryReclaimer::create()); - veloxLeafPool_ = veloxAggregatePool_->addLeafChild(name_ + "_default_leaf"); + veloxLeafPool_ = veloxAggregatePool_->addLeafChild("default_leaf"); } namespace { MemoryUsageStats collectVeloxMemoryUsageStats(const velox::memory::MemoryPool* pool) { MemoryUsageStats stats; - stats.set_current(pool->currentBytes()); + stats.set_current(pool->usedBytes()); stats.set_peak(pool->peakBytes()); // walk down root and all children pool->visitChildren([&](velox::memory::MemoryPool* pool) -> bool { @@ -216,7 +217,7 @@ int64_t shrinkVeloxMemoryPool(velox::memory::MemoryManager* mm, velox::memory::M std::string poolName{pool->root()->name() + "/" + pool->name()}; std::string logPrefix{"Shrink[" + poolName + "]: "}; VLOG(2) << logPrefix << "Trying to shrink " << size << " bytes of data..."; - VLOG(2) << logPrefix << "Pool has reserved " << pool->currentBytes() << "/" << pool->root()->reservedBytes() << "/" + VLOG(2) << logPrefix << "Pool has reserved " << pool->usedBytes() << "/" << pool->root()->reservedBytes() << "/" << pool->root()->capacity() << "/" << pool->root()->maxCapacity() << " bytes."; VLOG(2) << logPrefix << "Shrinking..."; const uint64_t oldCapacity = pool->capacity(); @@ -263,14 +264,14 @@ void VeloxMemoryManager::hold() { bool VeloxMemoryManager::tryDestructSafe() { // Velox memory pools considered safe to destruct 
when no alive allocations. for (const auto& pool : heldVeloxPools_) { - if (pool && pool->currentBytes() != 0) { + if (pool && pool->usedBytes() != 0) { return false; } } - if (veloxLeafPool_ && veloxLeafPool_->currentBytes() != 0) { + if (veloxLeafPool_ && veloxLeafPool_->usedBytes() != 0) { return false; } - if (veloxAggregatePool_ && veloxAggregatePool_->currentBytes() != 0) { + if (veloxAggregatePool_ && veloxAggregatePool_->usedBytes() != 0) { return false; } heldVeloxPools_.clear(); @@ -313,8 +314,10 @@ bool VeloxMemoryManager::tryDestructSafe() { VeloxMemoryManager::~VeloxMemoryManager() { static const uint32_t kWaitTimeoutMs = FLAGS_gluten_velox_aysnc_timeout_on_task_stopping; // 30s by default uint32_t accumulatedWaitMs = 0UL; + bool destructed = false; for (int32_t tryCount = 0; accumulatedWaitMs < kWaitTimeoutMs; tryCount++) { - if (tryDestructSafe()) { + destructed = tryDestructSafe(); + if (destructed) { if (tryCount > 0) { LOG(INFO) << "All the outstanding memory resources successfully released. "; } @@ -326,6 +329,13 @@ VeloxMemoryManager::~VeloxMemoryManager() { usleep(waitMs * 1000); accumulatedWaitMs += waitMs; } + if (!destructed) { + LOG(ERROR) << "Failed to release Velox memory manager after " << accumulatedWaitMs + << "ms as there are still outstanding memory resources. "; + } +#ifdef ENABLE_JEMALLOC + je_gluten_malloc_stats_print(NULL, NULL, NULL); +#endif } } // namespace gluten diff --git a/cpp/velox/memory/VeloxMemoryManager.h b/cpp/velox/memory/VeloxMemoryManager.h index 3ba5bbf7d25f8..3607ca793f3e4 100644 --- a/cpp/velox/memory/VeloxMemoryManager.h +++ b/cpp/velox/memory/VeloxMemoryManager.h @@ -27,10 +27,7 @@ namespace gluten { class VeloxMemoryManager final : public MemoryManager { public: - VeloxMemoryManager( - const std::string& name, - std::shared_ptr allocator, - std::unique_ptr listener); + VeloxMemoryManager(std::unique_ptr listener); ~VeloxMemoryManager() override; VeloxMemoryManager(const VeloxMemoryManager&) = delete; @@ -72,8 +69,6 @@ class VeloxMemoryManager final : public MemoryManager { private: bool tryDestructSafe(); - std::string name_; - #ifdef GLUTEN_ENABLE_HBM std::unique_ptr wrappedAlloc_; #endif @@ -92,8 +87,7 @@ class VeloxMemoryManager final : public MemoryManager { /// Not tracked by Spark and should only used in test or validation. 
inline std::shared_ptr getDefaultMemoryManager() { - static auto memoryManager = std::make_shared( - "test", gluten::defaultMemoryAllocator(), gluten::AllocationListener::noop()); + static auto memoryManager = std::make_shared(gluten::AllocationListener::noop()); return memoryManager; } diff --git a/cpp/velox/operators/functions/RegistrationAllFunctions.cc b/cpp/velox/operators/functions/RegistrationAllFunctions.cc index b88d781b69b24..6b6564fa4aa3a 100644 --- a/cpp/velox/operators/functions/RegistrationAllFunctions.cc +++ b/cpp/velox/operators/functions/RegistrationAllFunctions.cc @@ -21,11 +21,11 @@ #include "operators/functions/RowFunctionWithNull.h" #include "velox/expression/SpecialFormRegistry.h" #include "velox/expression/VectorFunction.h" +#include "velox/functions/lib/CheckedArithmetic.h" #include "velox/functions/lib/RegistrationHelpers.h" #include "velox/functions/prestosql/aggregates/RegisterAggregateFunctions.h" #include "velox/functions/prestosql/registration/RegistrationFunctions.h" #include "velox/functions/prestosql/window/WindowFunctionsRegistration.h" -#include "velox/functions/sparksql/Bitwise.h" #include "velox/functions/sparksql/Hash.h" #include "velox/functions/sparksql/Rand.h" #include "velox/functions/sparksql/Register.h" @@ -34,6 +34,14 @@ using namespace facebook; +namespace facebook::velox::functions { +void registerPrestoVectorFunctions() { + // Presto function. To be removed. + VELOX_REGISTER_VECTOR_FUNCTION(udf_arrays_overlap, "arrays_overlap"); + VELOX_REGISTER_VECTOR_FUNCTION(udf_transform_keys, "transform_keys"); + VELOX_REGISTER_VECTOR_FUNCTION(udf_transform_values, "transform_values"); +} +} // namespace facebook::velox::functions namespace gluten { namespace { void registerFunctionOverwrite() { @@ -44,9 +52,6 @@ void registerFunctionOverwrite() { velox::registerFunction({"round"}); velox::registerFunction({"round"}); velox::registerFunction({"round"}); - // TODO: the below rand function registry can be removed after presto function registry is removed. - velox::registerFunction>({"spark_rand"}); - velox::registerFunction>({"spark_rand"}); auto kRowConstructorWithNull = RowConstructorWithNullCallToSpecialForm::kRowConstructorWithNull; velox::exec::registerVectorFunction( @@ -66,15 +71,12 @@ void registerFunctionOverwrite() { velox::exec::registerFunctionCallToSpecialForm( kRowConstructorWithAllNull, std::make_unique(kRowConstructorWithAllNull)); - velox::functions::sparksql::registerBitwiseFunctions("spark_"); + + velox::functions::registerPrestoVectorFunctions(); } } // namespace void registerAllFunctions() { - // The registration order matters. Spark sql functions are registered after - // presto sql functions to overwrite the registration for same named - // functions. 
- velox::functions::prestosql::registerAllScalarFunctions(); velox::functions::sparksql::registerFunctions(""); velox::aggregate::prestosql::registerAllAggregateFunctions( "", true /*registerCompanionFunctions*/, false /*onlyPrestoSignatures*/, true /*overwrite*/); diff --git a/cpp/velox/operators/writer/VeloxParquetDatasource.cc b/cpp/velox/operators/writer/VeloxParquetDatasource.cc index 2677b0a812d10..58aa9f33af04f 100644 --- a/cpp/velox/operators/writer/VeloxParquetDatasource.cc +++ b/cpp/velox/operators/writer/VeloxParquetDatasource.cc @@ -43,13 +43,16 @@ namespace { const int32_t kGzipWindowBits4k = 12; } -void VeloxParquetDatasource::init(const std::unordered_map& sparkConfs) { +void VeloxParquetDatasource::initSink(const std::unordered_map& /* sparkConfs */) { if (strncmp(filePath_.c_str(), "file:", 5) == 0) { sink_ = dwio::common::FileSink::create(filePath_, {.pool = pool_.get()}); } else { throw std::runtime_error("The file path is not local when writing data with parquet format in velox runtime!"); } +} +void VeloxParquetDatasource::init(const std::unordered_map& sparkConfs) { + initSink(sparkConfs); ArrowSchema cSchema{}; arrow::Status status = arrow::ExportSchema(*(schema_.get()), &cSchema); if (!status.ok()) { @@ -117,7 +120,7 @@ void VeloxParquetDatasource::inspectSchema(struct ArrowSchema* out) { std::shared_ptr readFile{fs->openFileForRead(filePath_)}; std::unique_ptr reader = - velox::dwio::common::getReaderFactory(readerOptions.getFileFormat()) + velox::dwio::common::getReaderFactory(readerOptions.fileFormat()) ->createReader( std::make_unique( std::make_shared(readFile), *pool_.get()), diff --git a/cpp/velox/operators/writer/VeloxParquetDatasource.h b/cpp/velox/operators/writer/VeloxParquetDatasource.h index 3df444016beb6..12cf2c301a391 100644 --- a/cpp/velox/operators/writer/VeloxParquetDatasource.h +++ b/cpp/velox/operators/writer/VeloxParquetDatasource.h @@ -89,6 +89,7 @@ class VeloxParquetDatasource : public Datasource { : Datasource(filePath, schema), filePath_(filePath), schema_(schema), pool_(std::move(veloxPool)) {} void init(const std::unordered_map& sparkConfs) override; + virtual void initSink(const std::unordered_map& sparkConfs); void inspectSchema(struct ArrowSchema* out) override; void write(const std::shared_ptr& cb) override; void close() override; diff --git a/cpp/velox/operators/writer/VeloxParquetDatasourceABFS.h b/cpp/velox/operators/writer/VeloxParquetDatasourceABFS.h index 208e6a7ec55c8..82e8f794cbce5 100644 --- a/cpp/velox/operators/writer/VeloxParquetDatasourceABFS.h +++ b/cpp/velox/operators/writer/VeloxParquetDatasourceABFS.h @@ -41,13 +41,13 @@ class VeloxParquetDatasourceABFS final : public VeloxParquetDatasource { std::shared_ptr sinkPool, std::shared_ptr schema) : VeloxParquetDatasource(filePath, veloxPool, sinkPool, schema) {} - void init(const std::unordered_map& sparkConfs) override { + + void initSink(const std::unordered_map& sparkConfs) override { auto hiveConf = getHiveConfig(std::make_shared(sparkConfs)); auto fileSystem = filesystems::getFileSystem(filePath_, hiveConf); auto* abfsFileSystem = dynamic_cast(fileSystem.get()); sink_ = std::make_unique( abfsFileSystem->openFileForWrite(filePath_, {{}, sinkPool_.get()}), filePath_); - VeloxParquetDatasource::init(sparkConfs); } }; } // namespace gluten diff --git a/cpp/velox/operators/writer/VeloxParquetDatasourceGCS.h b/cpp/velox/operators/writer/VeloxParquetDatasourceGCS.h index b8a9b5431df67..0c2bfa2138929 100644 --- a/cpp/velox/operators/writer/VeloxParquetDatasourceGCS.h +++ 
b/cpp/velox/operators/writer/VeloxParquetDatasourceGCS.h @@ -40,12 +40,12 @@ class VeloxParquetDatasourceGCS final : public VeloxParquetDatasource { std::shared_ptr sinkPool, std::shared_ptr schema) : VeloxParquetDatasource(filePath, veloxPool, sinkPool, schema) {} - void init(const std::unordered_map& sparkConfs) override { + + void initSink(const std::unordered_map& /* sparkConfs */) override { auto fileSystem = filesystems::getFileSystem(filePath_, nullptr); auto* gcsFileSystem = dynamic_cast(fileSystem.get()); sink_ = std::make_unique( gcsFileSystem->openFileForWrite(filePath_, {{}, sinkPool_.get()}), filePath_); - VeloxParquetDatasource::init(sparkConfs); } }; } // namespace gluten diff --git a/cpp/velox/operators/writer/VeloxParquetDatasourceHDFS.h b/cpp/velox/operators/writer/VeloxParquetDatasourceHDFS.h index 32cf960cbf2f4..7722c8e51993f 100644 --- a/cpp/velox/operators/writer/VeloxParquetDatasourceHDFS.h +++ b/cpp/velox/operators/writer/VeloxParquetDatasourceHDFS.h @@ -41,10 +41,10 @@ class VeloxParquetDatasourceHDFS final : public VeloxParquetDatasource { std::shared_ptr sinkPool, std::shared_ptr schema) : VeloxParquetDatasource(filePath, veloxPool, sinkPool, schema) {} - void init(const std::unordered_map& sparkConfs) override { + + void initSink(const std::unordered_map& sparkConfs) override { auto hiveConf = getHiveConfig(std::make_shared(sparkConfs)); sink_ = dwio::common::FileSink::create(filePath_, {.connectorProperties = hiveConf, .pool = sinkPool_.get()}); - VeloxParquetDatasource::init(sparkConfs); } }; } // namespace gluten diff --git a/cpp/velox/operators/writer/VeloxParquetDatasourceS3.h b/cpp/velox/operators/writer/VeloxParquetDatasourceS3.h index a5c49fcd9f817..3231a8a1ee5cd 100644 --- a/cpp/velox/operators/writer/VeloxParquetDatasourceS3.h +++ b/cpp/velox/operators/writer/VeloxParquetDatasourceS3.h @@ -41,10 +41,10 @@ class VeloxParquetDatasourceS3 final : public VeloxParquetDatasource { std::shared_ptr sinkPool, std::shared_ptr schema) : VeloxParquetDatasource(filePath, veloxPool, sinkPool, schema) {} - void init(const std::unordered_map& sparkConfs) override { + + void initSink(const std::unordered_map& sparkConfs) override { auto hiveConf = getHiveConfig(std::make_shared(sparkConfs)); sink_ = dwio::common::FileSink::create(filePath_, {.connectorProperties = hiveConf, .pool = sinkPool_.get()}); - VeloxParquetDatasource::init(sparkConfs); } }; } // namespace gluten diff --git a/cpp/velox/shuffle/VeloxHashBasedShuffleWriter.cc b/cpp/velox/shuffle/VeloxHashBasedShuffleWriter.cc index daff1370332f1..3bd1a2fbc6cce 100644 --- a/cpp/velox/shuffle/VeloxHashBasedShuffleWriter.cc +++ b/cpp/velox/shuffle/VeloxHashBasedShuffleWriter.cc @@ -129,6 +129,14 @@ arrow::Status collectFlatVectorBufferStringView( return arrow::Status::OK(); } +template <> +arrow::Status collectFlatVectorBuffer( + facebook::velox::BaseVector* vector, + std::vector>& buffers, + arrow::MemoryPool* pool) { + return arrow::Status::OK(); +} + template <> arrow::Status collectFlatVectorBuffer( facebook::velox::BaseVector* vector, @@ -850,8 +858,8 @@ uint32_t VeloxHashBasedShuffleWriter::calculatePartitionBufferSize( memLimit > 0 && bytesPerRow > 0 ? 
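The datasource changes above split sink creation out of init() into a virtual initSink(), so the ABFS, GCS, HDFS and S3 subclasses override only the sink step and no longer call back into the base init(). A stripped-down sketch of that shape follows; Sink, the config type, and the class names are simplified stand-ins rather than the real Velox/Gluten types.

#include <map>
#include <memory>
#include <stdexcept>
#include <string>

struct Sink {}; // stand-in for dwio::common::FileSink

class ParquetDatasource {
 public:
  virtual ~ParquetDatasource() = default;

  // init() performs the setup shared by all backends (schema export, writer
  // creation, ...) and delegates only the sink creation to the virtual hook.
  void init(const std::map<std::string, std::string>& sparkConfs) {
    initSink(sparkConfs);
    // ... export schema, create the parquet writer, etc.
  }

 protected:
  // Local-file default; filesystem-specific subclasses override just this.
  virtual void initSink(const std::map<std::string, std::string>& /*sparkConfs*/) {
    if (filePath_.rfind("file:", 0) != 0) {
      throw std::runtime_error("The file path is not local");
    }
    sink_ = std::make_unique<Sink>();
  }

  std::string filePath_ = "file:/tmp/out.parquet";
  std::unique_ptr<Sink> sink_;
};

class HdfsParquetDatasource final : public ParquetDatasource {
 protected:
  void initSink(const std::map<std::string, std::string>& sparkConfs) override {
    // Build the HDFS-backed sink from the Hive config; no call to the base init() anymore.
    (void)sparkConfs;
    sink_ = std::make_unique<Sink>();
  }
};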
memLimit / bytesPerRow / numPartitions_ >> 2 : options_.bufferSize; preAllocRowCnt = std::min(preAllocRowCnt, (uint64_t)options_.bufferSize); - VLOG(9) << "Calculated partition buffer size - memLimit: " << memLimit << ", bytesPerRow: " << bytesPerRow - << ", preAllocRowCnt: " << preAllocRowCnt << std::endl; + DLOG(INFO) << "Calculated partition buffer size - memLimit: " << memLimit << ", bytesPerRow: " << bytesPerRow + << ", preAllocRowCnt: " << preAllocRowCnt << std::endl; VS_PRINTLF(preAllocRowCnt); @@ -1392,7 +1400,7 @@ arrow::Result VeloxHashBasedShuffleWriter::partitionBufferSizeAfterShr arrow::Status VeloxHashBasedShuffleWriter::preAllocPartitionBuffers(uint32_t preAllocBufferSize) { for (auto& pid : partitionUsed_) { auto newSize = std::max(preAllocBufferSize, partition2RowCount_[pid]); - VLOG_IF(9, partitionBufferSize_[pid] != newSize) + DLOG_IF(INFO, partitionBufferSize_[pid] != newSize) << "Actual partition buffer size - current: " << partitionBufferSize_[pid] << ", newSize: " << newSize << std::endl; // Make sure the size to be allocated is larger than the size to be filled. diff --git a/cpp/velox/shuffle/VeloxShuffleWriter.cc b/cpp/velox/shuffle/VeloxShuffleWriter.cc new file mode 100644 index 0000000000000..4b4f73f9463cb --- /dev/null +++ b/cpp/velox/shuffle/VeloxShuffleWriter.cc @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "shuffle/VeloxShuffleWriter.h" +#include "shuffle/VeloxHashBasedShuffleWriter.h" +#include "shuffle/VeloxSortBasedShuffleWriter.h" + +namespace gluten { +arrow::Result> VeloxShuffleWriter::create( + ShuffleWriterType type, + uint32_t numPartitions, + std::unique_ptr partitionWriter, + ShuffleWriterOptions options, + std::shared_ptr veloxPool, + arrow::MemoryPool* arrowPool) { + std::shared_ptr shuffleWriter; + switch (type) { + case kHashShuffle: + return VeloxHashBasedShuffleWriter::create( + numPartitions, std::move(partitionWriter), std::move(options), veloxPool, arrowPool); + case kSortShuffle: + return VeloxSortBasedShuffleWriter::create( + numPartitions, std::move(partitionWriter), std::move(options), veloxPool, arrowPool); + default: + return arrow::Status::Invalid("Unsupported shuffle writer type: ", std::to_string(type)); + } +} +} // namespace gluten diff --git a/cpp/velox/shuffle/VeloxShuffleWriter.h b/cpp/velox/shuffle/VeloxShuffleWriter.h index 104b876162912..0b49789c6d89e 100644 --- a/cpp/velox/shuffle/VeloxShuffleWriter.h +++ b/cpp/velox/shuffle/VeloxShuffleWriter.h @@ -37,6 +37,7 @@ #include #include "memory/VeloxMemoryManager.h" +#include "shuffle/Options.h" #include "shuffle/PartitionWriter.h" #include "shuffle/Partitioner.h" #include "shuffle/ShuffleWriter.h" @@ -48,9 +49,17 @@ namespace gluten { class VeloxShuffleWriter : public ShuffleWriter { public: + static arrow::Result> create( + ShuffleWriterType type, + uint32_t numPartitions, + std::unique_ptr partitionWriter, + ShuffleWriterOptions options, + std::shared_ptr veloxPool, + arrow::MemoryPool* arrowPool); + facebook::velox::RowVectorPtr getStrippedRowVector(const facebook::velox::RowVector& rv) { // get new row type - auto rowType = rv.type()->asRow(); + auto& rowType = rv.type()->asRow(); auto typeChildren = rowType.children(); typeChildren.erase(typeChildren.begin()); auto newRowType = facebook::velox::ROW(std::move(typeChildren)); diff --git a/cpp/velox/substrait/SubstraitParser.cc b/cpp/velox/substrait/SubstraitParser.cc index f417618d8117e..b842914ca9334 100644 --- a/cpp/velox/substrait/SubstraitParser.cc +++ b/cpp/velox/substrait/SubstraitParser.cc @@ -131,6 +131,9 @@ void SubstraitParser::parseColumnTypes( case ::substrait::NamedStruct::METADATA_COL: columnTypes.push_back(ColumnType::kSynthesized); break; + case ::substrait::NamedStruct::ROWINDEX_COL: + columnTypes.push_back(ColumnType::kRowIndex); + break; default: VELOX_FAIL("Unspecified column type."); } @@ -388,24 +391,13 @@ std::unordered_map SubstraitParser::substraitVeloxFunc {"named_struct", "row_constructor"}, {"bit_or", "bitwise_or_agg"}, {"bit_and", "bitwise_and_agg"}, - {"bitwise_and", "spark_bitwise_and"}, - {"bitwise_not", "spark_bitwise_not"}, - {"bitwise_or", "spark_bitwise_or"}, - {"bitwise_xor", "spark_bitwise_xor"}, - // TODO: the below registry for rand functions can be removed - // after presto function registry is removed. 
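The new VeloxShuffleWriter::create entry point hides the choice between the hash-based and sort-based writers behind ShuffleWriterType. A hypothetical call site could look like the following; buildAndUseWriter and its arguments are placeholders, only the create call itself comes from the diff.

#include <arrow/result.h>
#include <arrow/status.h>
#include "shuffle/VeloxShuffleWriter.h"

arrow::Status buildAndUseWriter(
    gluten::ShuffleWriterType type,
    uint32_t numPartitions,
    std::unique_ptr<gluten::PartitionWriter> partitionWriter,
    gluten::ShuffleWriterOptions options,
    std::shared_ptr<facebook::velox::memory::MemoryPool> veloxPool,
    arrow::MemoryPool* arrowPool) {
  // kHashShuffle selects VeloxHashBasedShuffleWriter, kSortShuffle selects
  // VeloxSortBasedShuffleWriter; any other value surfaces arrow::Status::Invalid.
  ARROW_ASSIGN_OR_RAISE(
      auto writer,
      gluten::VeloxShuffleWriter::create(
          type, numPartitions, std::move(partitionWriter), std::move(options), veloxPool, arrowPool));
  // ... writing batches and stopping the writer would follow here.
  return arrow::Status::OK();
}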
- {"rand", "spark_rand"}, {"murmur3hash", "hash_with_seed"}, {"xxhash64", "xxhash64_with_seed"}, {"modulus", "remainder"}, {"date_format", "format_datetime"}, {"collect_set", "set_agg"}, - {"try_add", "plus"}, - {"forall", "all_match"}, - {"exists", "any_match"}, {"negative", "unaryminus"}, - {"get_array_item", "get"}, - {"arrays_zip", "zip"}}; + {"get_array_item", "get"}}; const std::unordered_map SubstraitParser::typeMap_ = { {"bool", "BOOLEAN"}, diff --git a/cpp/velox/substrait/SubstraitToVeloxPlan.cc b/cpp/velox/substrait/SubstraitToVeloxPlan.cc index b82eead2c565e..73047b2f49073 100644 --- a/cpp/velox/substrait/SubstraitToVeloxPlan.cc +++ b/cpp/velox/substrait/SubstraitToVeloxPlan.cc @@ -20,11 +20,13 @@ #include "VariantToVectorConverter.h" #include "velox/connectors/hive/HiveDataSink.h" #include "velox/exec/TableWriter.h" +#include "velox/type/Filter.h" #include "velox/type/Type.h" #include "utils/ConfigExtractor.h" #include "config/GlutenConfig.h" +#include "operators/plannodes/RowVectorStream.h" namespace gluten { namespace { @@ -709,16 +711,23 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait:: namespace { void extractUnnestFieldExpr( - std::shared_ptr projNode, + std::shared_ptr child, int32_t index, std::vector& unnestFields) { - auto name = projNode->names()[index]; - auto expr = projNode->projections()[index]; - auto type = expr->type(); + if (auto projNode = std::dynamic_pointer_cast(child)) { + auto name = projNode->names()[index]; + auto expr = projNode->projections()[index]; + auto type = expr->type(); - auto unnestFieldExpr = std::make_shared(type, name); - VELOX_CHECK_NOT_NULL(unnestFieldExpr, " the key in unnest Operator only support field"); - unnestFields.emplace_back(unnestFieldExpr); + auto unnestFieldExpr = std::make_shared(type, name); + VELOX_CHECK_NOT_NULL(unnestFieldExpr, " the key in unnest Operator only support field"); + unnestFields.emplace_back(unnestFieldExpr); + } else { + auto name = child->outputType()->names()[index]; + auto field = child->outputType()->childAt(index); + auto unnestFieldExpr = std::make_shared(field, name); + unnestFields.emplace_back(unnestFieldExpr); + } } } // namespace @@ -751,10 +760,13 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait:: SubstraitParser::configSetInOptimization(generateRel.advanced_extension(), "injectedProject="); if (injectedProject) { - auto projNode = std::dynamic_pointer_cast(childNode); + // Child should be either ProjectNode or ValueStreamNode in case of project fallback. 
VELOX_CHECK( - projNode != nullptr && projNode->names().size() > requiredChildOutput.size(), - "injectedProject is true, but the Project is missing or does not have the corresponding projection field") + (std::dynamic_pointer_cast(childNode) != nullptr || + std::dynamic_pointer_cast(childNode) != nullptr) && + childNode->outputType()->size() > requiredChildOutput.size(), + "injectedProject is true, but the ProjectNode or ValueStreamNode (in case of projection fallback)" + " is missing or does not have the corresponding projection field") bool isStack = generateRel.has_advanced_extension() && SubstraitParser::configSetInOptimization(generateRel.advanced_extension(), "isStack="); @@ -767,7 +779,8 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait:: // +- Project [fake_column#128, [1,2,3] AS _pre_0#129] // +- RewrittenNodeWall Scan OneRowRelation[fake_column#128] // The last projection column in GeneratorRel's child(Project) is the column we need to unnest - extractUnnestFieldExpr(projNode, projNode->projections().size() - 1, unnest); + auto index = childNode->outputType()->size() - 1; + extractUnnestFieldExpr(childNode, index, unnest); } else { // For stack function, e.g. stack(2, 1,2,3), a sample // input substrait plan is like the following: @@ -781,10 +794,10 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait:: auto generatorFunc = generator.scalar_function(); auto numRows = SubstraitParser::getLiteralValue(generatorFunc.arguments(0).value().literal()); auto numFields = static_cast(std::ceil((generatorFunc.arguments_size() - 1.0) / numRows)); - auto totalProjectCount = projNode->names().size(); + auto totalProjectCount = childNode->outputType()->size(); for (auto i = totalProjectCount - numFields; i < totalProjectCount; ++i) { - extractUnnestFieldExpr(projNode, i, unnest); + extractUnnestFieldExpr(childNode, i, unnest); } } } else { @@ -823,10 +836,11 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait:: nextPlanNodeId(), replicated, unnest, std::move(unnestNames), ordinalityName, childNode); } -const core::WindowNode::Frame createWindowFrame( +const core::WindowNode::Frame SubstraitToVeloxPlanConverter::createWindowFrame( const ::substrait::Expression_WindowFunction_Bound& lower_bound, const ::substrait::Expression_WindowFunction_Bound& upper_bound, - const ::substrait::WindowType& type) { + const ::substrait::WindowType& type, + const RowTypePtr& inputType) { core::WindowNode::Frame frame; switch (type) { case ::substrait::WindowType::ROWS: @@ -839,9 +853,22 @@ const core::WindowNode::Frame createWindowFrame( VELOX_FAIL("the window type only support ROWS and RANGE, and the input type is ", std::to_string(type)); } - auto boundTypeConversion = [](::substrait::Expression_WindowFunction_Bound boundType) + auto specifiedBound = + [&](bool hasOffset, int64_t offset, const ::substrait::Expression& columnRef) -> core::TypedExprPtr { + if (hasOffset) { + VELOX_CHECK( + frame.type != core::WindowNode::WindowType::kRange, + "for RANGE frame offset, we should pre-calculate the range frame boundary and pass the column reference, but got a constant offset.") + return std::make_shared(BIGINT(), variant(offset)); + } else { + VELOX_CHECK( + frame.type != core::WindowNode::WindowType::kRows, "for ROW frame offset, we should pass a constant offset.") + return exprConverter_->toVeloxExpr(columnRef, inputType); + } + }; + + auto boundTypeConversion = [&](::substrait::Expression_WindowFunction_Bound boundType) -> 
std::tuple { - // TODO: support non-literal expression. if (boundType.has_current_row()) { return std::make_tuple(core::WindowNode::BoundType::kCurrentRow, nullptr); } else if (boundType.has_unbounded_following()) { @@ -849,13 +876,15 @@ const core::WindowNode::Frame createWindowFrame( } else if (boundType.has_unbounded_preceding()) { return std::make_tuple(core::WindowNode::BoundType::kUnboundedPreceding, nullptr); } else if (boundType.has_following()) { + auto following = boundType.following(); return std::make_tuple( core::WindowNode::BoundType::kFollowing, - std::make_shared(BIGINT(), variant(boundType.following().offset()))); + specifiedBound(following.has_offset(), following.offset(), following.ref())); } else if (boundType.has_preceding()) { + auto preceding = boundType.preceding(); return std::make_tuple( core::WindowNode::BoundType::kPreceding, - std::make_shared(BIGINT(), variant(boundType.preceding().offset()))); + specifiedBound(preceding.has_offset(), preceding.offset(), preceding.ref())); } else { VELOX_FAIL("The BoundType is not supported."); } @@ -906,7 +935,7 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait:: windowColumnNames.push_back(windowFunction.column_name()); windowNodeFunctions.push_back( - {std::move(windowCall), std::move(createWindowFrame(lowerBound, upperBound, type)), ignoreNulls}); + {std::move(windowCall), std::move(createWindowFrame(lowerBound, upperBound, type, inputType)), ignoreNulls}); } // Construct partitionKeys @@ -1449,10 +1478,12 @@ connector::hive::SubfieldFilters SubstraitToVeloxPlanConverter::createSubfieldFi auto expr = scalarFunction.arguments()[0].value(); if (expr.has_scalar_function()) { // Set its child to filter info with reverse enabled. - setFilterInfo(scalarFunction.arguments()[0].value().scalar_function(), inputTypeList, columnToFilterInfo, true); + setFilterInfo(expr.scalar_function(), inputTypeList, columnToFilterInfo, true); + } else if (expr.has_singular_or_list()) { + auto singularOrList = expr.singular_or_list(); + setFilterInfo(singularOrList, columnToFilterInfo, true); } else { - // TODO: support push down of Not In. - VELOX_NYI("Scalar function expected."); + VELOX_NYI("Only support push down Not with scalar function or In."); } } else if (filterName == sOr) { VELOX_CHECK(scalarFunction.arguments().size() == 2); @@ -1577,24 +1608,26 @@ bool SubstraitToVeloxPlanConverter::canPushdownNot( std::vector& rangeRecorders) { VELOX_CHECK(scalarFunction.arguments().size() == 1, "Only one arg is expected for Not."); const auto& notArg = scalarFunction.arguments()[0]; - if (!notArg.value().has_scalar_function()) { - // Not for a Boolean Literal or Or List is not supported curretly. - // It can be pushed down with an AlwaysTrue or AlwaysFalse Range. 
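createWindowFrame now distinguishes the two bound encodings: a literal offset is only legal for ROWS frames and becomes a constant BIGINT expression, while a RANGE frame is expected to carry a pre-computed boundary column that goes through the expression converter. A hedged sketch of that decision with simplified types; the real code raises VELOX_CHECK failures rather than exceptions and builds Velox typed expressions.

#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>

enum class FrameType { kRows, kRange };

struct BoundExpr {}; // stand-in for core::TypedExprPtr
std::shared_ptr<BoundExpr> makeConstantBigint(int64_t /*offset*/) { return std::make_shared<BoundExpr>(); }
std::shared_ptr<BoundExpr> makeColumnRef(const std::string& /*column*/) { return std::make_shared<BoundExpr>(); }

// Mirrors the specifiedBound lambda: hasOffset selects the constant path (ROWS only),
// otherwise the boundary comes from a column reference (RANGE only).
std::shared_ptr<BoundExpr> specifiedBound(FrameType frameType, bool hasOffset, int64_t offset, const std::string& column) {
  if (hasOffset) {
    if (frameType == FrameType::kRange) {
      throw std::runtime_error("RANGE frames expect a pre-computed boundary column, not a constant offset");
    }
    return makeConstantBigint(offset);
  }
  if (frameType == FrameType::kRows) {
    throw std::runtime_error("ROWS frames expect a constant offset");
  }
  return makeColumnRef(column);
}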
- return false; - } - - auto argFunction = - SubstraitParser::findFunctionSpec(functionMap_, notArg.value().scalar_function().function_reference()); - auto functionName = SubstraitParser::getNameBeforeDelimiter(argFunction); + if (notArg.value().has_singular_or_list()) { + auto singularOrList = notArg.value().singular_or_list(); + if (!canPushdownSingularOrList(singularOrList)) { + return false; + } + uint32_t colIdx = getColumnIndexFromSingularOrList(singularOrList); + return rangeRecorders.at(colIdx).setInRange(); + } else if (notArg.value().has_scalar_function()) { + auto argFunction = + SubstraitParser::findFunctionSpec(functionMap_, notArg.value().scalar_function().function_reference()); + auto functionName = SubstraitParser::getNameBeforeDelimiter(argFunction); - static const std::unordered_set supportedNotFunctions = {sGte, sGt, sLte, sLt, sEqual}; + static const std::unordered_set supportedNotFunctions = {sGte, sGt, sLte, sLt, sEqual}; - uint32_t fieldIdx; - bool isFieldOrWithLiteral = fieldOrWithLiteral(notArg.value().scalar_function().arguments(), fieldIdx); + uint32_t fieldIdx; + bool isFieldOrWithLiteral = fieldOrWithLiteral(notArg.value().scalar_function().arguments(), fieldIdx); - if (supportedNotFunctions.find(functionName) != supportedNotFunctions.end() && isFieldOrWithLiteral && - rangeRecorders.at(fieldIdx).setCertainRangeForFunction(functionName, true /*reverse*/)) { - return true; + return ( + supportedNotFunctions.find(functionName) != supportedNotFunctions.end() && isFieldOrWithLiteral && + rangeRecorders.at(fieldIdx).setCertainRangeForFunction(functionName, true /*reverse*/)); } return false; } @@ -1950,6 +1983,7 @@ template void SubstraitToVeloxPlanConverter::setInFilter( const std::vector& variants, bool nullAllowed, + bool negated, const std::string& inputName, connector::hive::SubfieldFilters& filters) {} @@ -1957,6 +1991,7 @@ template <> void SubstraitToVeloxPlanConverter::setInFilter( const std::vector& variants, bool nullAllowed, + bool negated, const std::string& inputName, connector::hive::SubfieldFilters& filters) { std::vector values; @@ -1965,13 +2000,18 @@ void SubstraitToVeloxPlanConverter::setInFilter( int64_t value = variant.value(); values.emplace_back(value); } - filters[common::Subfield(inputName)] = common::createBigintValues(values, nullAllowed); + if (negated) { + filters[common::Subfield(inputName)] = common::createNegatedBigintValues(values, nullAllowed); + } else { + filters[common::Subfield(inputName)] = common::createBigintValues(values, nullAllowed); + } } template <> void SubstraitToVeloxPlanConverter::setInFilter( const std::vector& variants, bool nullAllowed, + bool negated, const std::string& inputName, connector::hive::SubfieldFilters& filters) { // Use bigint values for int type. @@ -1982,13 +2022,18 @@ void SubstraitToVeloxPlanConverter::setInFilter( int64_t value = variant.value(); values.emplace_back(value); } - filters[common::Subfield(inputName)] = common::createBigintValues(values, nullAllowed); + if (negated) { + filters[common::Subfield(inputName)] = common::createNegatedBigintValues(values, nullAllowed); + } else { + filters[common::Subfield(inputName)] = common::createBigintValues(values, nullAllowed); + } } template <> void SubstraitToVeloxPlanConverter::setInFilter( const std::vector& variants, bool nullAllowed, + bool negated, const std::string& inputName, connector::hive::SubfieldFilters& filters) { // Use bigint values for small int type. 
@@ -1999,13 +2044,18 @@ void SubstraitToVeloxPlanConverter::setInFilter( int64_t value = variant.value(); values.emplace_back(value); } - filters[common::Subfield(inputName)] = common::createBigintValues(values, nullAllowed); + if (negated) { + filters[common::Subfield(inputName)] = common::createNegatedBigintValues(values, nullAllowed); + } else { + filters[common::Subfield(inputName)] = common::createBigintValues(values, nullAllowed); + } } template <> void SubstraitToVeloxPlanConverter::setInFilter( const std::vector& variants, bool nullAllowed, + bool negated, const std::string& inputName, connector::hive::SubfieldFilters& filters) { // Use bigint values for tiny int type. @@ -2016,13 +2066,18 @@ void SubstraitToVeloxPlanConverter::setInFilter( int64_t value = variant.value(); values.emplace_back(value); } - filters[common::Subfield(inputName)] = common::createBigintValues(values, nullAllowed); + if (negated) { + filters[common::Subfield(inputName)] = common::createNegatedBigintValues(values, nullAllowed); + } else { + filters[common::Subfield(inputName)] = common::createBigintValues(values, nullAllowed); + } } template <> void SubstraitToVeloxPlanConverter::setInFilter( const std::vector& variants, bool nullAllowed, + bool negated, const std::string& inputName, connector::hive::SubfieldFilters& filters) { std::vector values; @@ -2031,7 +2086,11 @@ void SubstraitToVeloxPlanConverter::setInFilter( std::string value = variant.value(); values.emplace_back(value); } - filters[common::Subfield(inputName)] = std::make_unique(values, nullAllowed); + if (negated) { + filters[common::Subfield(inputName)] = std::make_unique(values, nullAllowed); + } else { + filters[common::Subfield(inputName)] = std::make_unique(values, nullAllowed); + } } template @@ -2086,6 +2145,17 @@ void SubstraitToVeloxPlanConverter::constructSubfieldFilters( if (filterInfo.notValue_) { filters[common::Subfield(inputName)] = std::make_unique(!filterInfo.notValue_.value().value(), nullAllowed); + } else if (filterInfo.notValues_.size() > 0) { + std::set notValues; + for (auto v : filterInfo.notValues_) { + notValues.emplace(v.value()); + } + if (notValues.size() == 1) { + filters[common::Subfield(inputName)] = std::make_unique(!(*notValues.begin()), nullAllowed); + } else { + // if there are more than one distinct value in NOT IN list, the filter should be AlwaysFalse + filters[common::Subfield(inputName)] = std::make_unique(); + } } else if (rangeSize == 0) { // IsNull/IsNotNull. if (!nullAllowed) { @@ -2124,11 +2194,22 @@ void SubstraitToVeloxPlanConverter::constructSubfieldFilters( if (filterInfo.values_.size() > 0) { // To filter out null is a default behaviour of Spark IN expression. nullAllowed = false; - setInFilter(filterInfo.values_, nullAllowed, inputName, filters); + setInFilter(filterInfo.values_, nullAllowed, false, inputName, filters); // Currently, In cannot coexist with other filter conditions // due to multirange is in 'OR' relation but 'AND' is needed. VELOX_CHECK(rangeSize == 0, "LowerBounds or upperBounds conditons cannot be supported after IN filter."); VELOX_CHECK(!filterInfo.notValue_.has_value(), "Not equal cannot be supported after IN filter."); + VELOX_CHECK(filterInfo.notValues_.size() == 0, "Not in cannot be supported after IN filter."); + return; + } + + // Handle not in filter. 
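The extra negated flag on setInFilter lets NOT IN reuse the IN code path: for integral columns it simply switches between common::createBigintValues and common::createNegatedBigintValues. Roughly, for the bigint case; the helper name is hypothetical, while the two Velox calls are the ones used in the hunks above.

#include <memory>
#include <vector>
#include "velox/type/Filter.h"

using namespace facebook::velox;

// Sketch of the integral-type branch: the same value list produces an IN filter
// or a NOT IN filter depending on `negated`.
std::unique_ptr<common::Filter> makeBigintListFilter(
    const std::vector<int64_t>& values,
    bool nullAllowed,
    bool negated) {
  if (negated) {
    return common::createNegatedBigintValues(values, nullAllowed);
  }
  return common::createBigintValues(values, nullAllowed);
}

Spark's IN filters out nulls by default, which is why the IN path pins nullAllowed to false, while the NOT IN path keeps the filter's own nullAllowed_ value.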
+ if (filterInfo.notValues_.size() > 0) { + setInFilter(filterInfo.notValues_, filterInfo.nullAllowed_, true, inputName, filters); + // Currently, NOT In cannot coexist with other filter conditions + // due to multirange is in 'OR' relation but 'AND' is needed. + VELOX_CHECK(rangeSize == 0, "LowerBounds or upperBounds conditons cannot be supported after NOT IN filter."); + VELOX_CHECK(!filterInfo.notValue_.has_value(), "Not equal cannot be supported after NOT IN filter."); return; } @@ -2413,7 +2494,8 @@ uint32_t SubstraitToVeloxPlanConverter::getColumnIndexFromSingularOrList( void SubstraitToVeloxPlanConverter::setFilterInfo( const ::substrait::Expression_SingularOrList& singularOrList, - std::vector& columnToFilterInfo) { + std::vector& columnToFilterInfo, + bool reverse) { VELOX_CHECK(singularOrList.options_size() > 0, "At least one option is expected."); // Get the column index. uint32_t colIdx = getColumnIndexFromSingularOrList(singularOrList); @@ -2427,7 +2509,11 @@ void SubstraitToVeloxPlanConverter::setFilterInfo( variants.emplace_back(exprConverter_->toVeloxExpr(option.literal())->value()); } // Set the value list to filter info. - columnToFilterInfo[colIdx].setValues(variants); + if (!reverse) { + columnToFilterInfo[colIdx].setValues(variants); + } else { + columnToFilterInfo[colIdx].setNotValues(variants); + } } } // namespace gluten diff --git a/cpp/velox/substrait/SubstraitToVeloxPlan.h b/cpp/velox/substrait/SubstraitToVeloxPlan.h index 1bda6435eaee0..1535b1f85f51b 100644 --- a/cpp/velox/substrait/SubstraitToVeloxPlan.h +++ b/cpp/velox/substrait/SubstraitToVeloxPlan.h @@ -19,6 +19,7 @@ #include "SubstraitToVeloxExpr.h" #include "TypeUtils.h" +#include "velox/connectors/hive/FileProperties.h" #include "velox/connectors/hive/TableHandle.h" #include "velox/core/PlanNode.h" #include "velox/dwio/common/Options.h" @@ -51,6 +52,9 @@ struct SplitInfo { /// The file format of the files to be scanned. dwio::common::FileFormat format; + /// The file sizes and modification times of the files to be scanned. + std::vector> properties; + /// Make SplitInfo polymorphic virtual ~SplitInfo() = default; }; @@ -111,6 +115,7 @@ class SubstraitToVeloxPlanConverter { /// Index: the index of the partition this item belongs to. /// Starts: the start positions in byte to read from the items. /// Lengths: the lengths in byte to read from the items. + /// FileProperties: the file sizes and modification times of the files to be scanned. core::PlanNodePtr toVeloxPlan(const ::substrait::ReadRel& sRead); core::PlanNodePtr constructValueStreamNode(const ::substrait::ReadRel& sRead, int32_t streamIdx); @@ -372,6 +377,16 @@ class SubstraitToVeloxPlanConverter { } } + // Set a list of values to be used in the push down of 'not in' expression. + void setNotValues(const std::vector& notValues) { + for (const auto& value : notValues) { + notValues_.emplace_back(value); + } + if (!initialized_) { + initialized_ = true; + } + } + // Whether this filter map is initialized. bool initialized_ = false; @@ -397,6 +412,9 @@ class SubstraitToVeloxPlanConverter { // The list of values used in 'in' expression. std::vector values_; + + // The list of values should not be equal to. + std::vector notValues_; }; /// Returns unique ID to use for plan node. Produces sequential numbers @@ -459,9 +477,11 @@ class SubstraitToVeloxPlanConverter { bool reverse = false); /// Extract SingularOrList and set it to the filter info map. + /// If reverse is true, the opposite filter info will be set. 
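On the parsing side, the reverse flag added to setFilterInfo is the only switch between the two lists: a plain IN lands in values_, a negated IN lands in notValues_, and constructSubfieldFilters later turns the latter into a negated values filter (or, for a boolean column, a BoolValue or AlwaysFalse filter). A minimal sketch of that routing with a simplified FilterInfo stand-in:

#include <cstdint>
#include <vector>

// Simplified stand-in for SubstraitToVeloxPlanConverter::FilterInfo.
struct FilterInfo {
  std::vector<int64_t> values;     // IN list
  std::vector<int64_t> notValues;  // NOT IN list
  bool initialized = false;

  void setValues(const std::vector<int64_t>& v) {
    values.insert(values.end(), v.begin(), v.end());
    initialized = true;
  }
  void setNotValues(const std::vector<int64_t>& v) {
    notValues.insert(notValues.end(), v.begin(), v.end());
    initialized = true;
  }
};

// Mirrors the tail of setFilterInfo(singularOrList, columnToFilterInfo, reverse).
void recordInList(FilterInfo& info, const std::vector<int64_t>& options, bool reverse) {
  if (!reverse) {
    info.setValues(options);
  } else {
    info.setNotValues(options);
  }
}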
void setFilterInfo( const ::substrait::Expression_SingularOrList& singularOrList, - std::vector& columnToFilterInfo); + std::vector& columnToFilterInfo, + bool reverse = false); /// Extract SingularOrList and returns the field index. static uint32_t getColumnIndexFromSingularOrList(const ::substrait::Expression_SingularOrList&); @@ -479,13 +499,15 @@ class SubstraitToVeloxPlanConverter { template void createNotEqualFilter(variant notVariant, bool nullAllowed, std::vector>& colFilters); - /// Create a values range to handle in filter. - /// variants: the list of values extracted from the in expression. + /// Create a values range to handle (not) in filter. + /// variants: the list of values extracted from the (not) in expression. + // negated: false for IN filter, true for NOT IN filter. /// inputName: the column input name. template void setInFilter( const std::vector& variants, bool nullAllowed, + bool negated, const std::string& inputName, connector::hive::SubfieldFilters& filters); @@ -550,6 +572,12 @@ class SubstraitToVeloxPlanConverter { return toVeloxPlan(rel.input()); } + const core::WindowNode::Frame createWindowFrame( + const ::substrait::Expression_WindowFunction_Bound& lower_bound, + const ::substrait::Expression_WindowFunction_Bound& upper_bound, + const ::substrait::WindowType& type, + const RowTypePtr& inputType); + /// The unique identification for each PlanNode. int planNodeId_ = 0; diff --git a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc index 0b08ca20517be..8e6dd7c1c31f3 100644 --- a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc +++ b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc @@ -68,7 +68,6 @@ static const std::unordered_set kBlackList = { "repeat", "trunc", "sequence", - "arrays_overlap", "approx_percentile", "get_array_struct_fields"}; @@ -908,9 +907,11 @@ bool SubstraitToVeloxPlanValidator::validate(const ::substrait::JoinRel& joinRel switch (joinRel.type()) { case ::substrait::JoinRel_JoinType_JOIN_TYPE_INNER: case ::substrait::JoinRel_JoinType_JOIN_TYPE_LEFT: + case ::substrait::JoinRel_JoinType_JOIN_TYPE_LEFT_SEMI: + case ::substrait::JoinRel_JoinType_JOIN_TYPE_RIGHT_SEMI: break; default: - LOG_VALIDATION_MSG("Sort merge join only support inner and left join."); + LOG_VALIDATION_MSG("Sort merge join only support inner, left, left semi and right semi join."); return false; } } @@ -924,7 +925,7 @@ bool SubstraitToVeloxPlanValidator::validate(const ::substrait::JoinRel& joinRel case ::substrait::JoinRel_JoinType_JOIN_TYPE_ANTI: break; default: - LOG_VALIDATION_MSG("Sort merge join only support inner and left join."); + LOG_VALIDATION_MSG("Join type is not supported: " + std::to_string(joinRel.type())); return false; } @@ -1046,6 +1047,16 @@ bool SubstraitToVeloxPlanValidator::validateAggRelFunctionType(const ::substrait LOG_VALIDATION_MSG("Validation failed for function " + funcName + " resolve type in AggregateRel."); return false; } + static const std::unordered_set notSupportComplexTypeAggFuncs = {"set_agg", "min", "max"}; + if (notSupportComplexTypeAggFuncs.find(baseFuncName) != notSupportComplexTypeAggFuncs.end() && + exec::isRawInput(funcStep)) { + auto type = binder.tryResolveType(signature->argumentTypes()[0]); + if (type->isArray() || type->isMap() || type->isRow()) { + LOG_VALIDATION_MSG("Validation failed for function " + baseFuncName + " complex type is not supported."); + return false; + } + } + resolved = true; break; } diff --git a/cpp/velox/symbols.map 
b/cpp/velox/symbols.map index ebd2b9af00960..525faf3526a1a 100644 --- a/cpp/velox/symbols.map +++ b/cpp/velox/symbols.map @@ -6,6 +6,8 @@ }; Java_org_apache_gluten_*; + JNI_OnLoad; + JNI_OnUnload; local: # Hide symbols of static dependencies *; diff --git a/cpp/velox/tests/BufferOutputStreamTest.cc b/cpp/velox/tests/BufferOutputStreamTest.cc index 3b3f78ceaefea..0e16f8c877694 100644 --- a/cpp/velox/tests/BufferOutputStreamTest.cc +++ b/cpp/velox/tests/BufferOutputStreamTest.cc @@ -16,6 +16,7 @@ */ #include "memory/BufferOutputStream.h" +#include "compute/VeloxBackend.h" #include "memory/VeloxColumnarBatch.h" #include "velox/common/memory/ByteStream.h" #include "velox/vector/tests/utils/VectorTestBase.h" @@ -27,6 +28,7 @@ class BufferOutputStreamTest : public ::testing::Test, public test::VectorTestBa protected: // Velox requires the mem manager to be instanced. static void SetUpTestCase() { + VeloxBackend::create({}); memory::MemoryManager::testingSetInstance({}); } @@ -55,16 +57,16 @@ TEST_F(BufferOutputStreamTest, outputStream) { reference->write(data.data(), data.size()); } auto str = referenceSStream.str(); - auto numBytes = veloxPool_->currentBytes(); + auto numBytes = veloxPool_->usedBytes(); EXPECT_LT(0, numBytes); { auto buffer = out->getBuffer(); - EXPECT_EQ(numBytes, veloxPool_->currentBytes()); + EXPECT_EQ(numBytes, veloxPool_->usedBytes()); EXPECT_EQ(str, std::string(buffer->as(), buffer->size())); } out.reset(); // We expect dropping the stream frees the backing memory. - EXPECT_EQ(0, veloxPool_->currentBytes()); + EXPECT_EQ(0, veloxPool_->usedBytes()); } } // namespace gluten diff --git a/cpp/velox/tests/CMakeLists.txt b/cpp/velox/tests/CMakeLists.txt index 29beb69da2209..f3d65f127f67a 100644 --- a/cpp/velox/tests/CMakeLists.txt +++ b/cpp/velox/tests/CMakeLists.txt @@ -16,14 +16,9 @@ function(add_velox_test TEST_EXEC) set(options) set(one_value_args) - set(multi_value_args - SOURCES - ) - cmake_parse_arguments(ARG - "${options}" - "${one_value_args}" - "${multi_value_args}" - ${ARGN}) + set(multi_value_args SOURCES) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" + "${multi_value_args}" ${ARGN}) if(ARG_SOURCES) set(SOURCES ${ARG_SOURCES}) @@ -31,34 +26,34 @@ function(add_velox_test TEST_EXEC) message(FATAL_ERROR "No sources specified for test ${TEST_NAME}") endif() add_executable(${TEST_EXEC} ${SOURCES} ${VELOX_TEST_COMMON_SRCS}) - target_include_directories(${TEST_EXEC} PRIVATE ${CMAKE_SOURCE_DIR}/velox ${CMAKE_SOURCE_DIR}/src ${VELOX_BUILD_PATH}/_deps/duckdb-src/src/include) - target_link_libraries(${TEST_EXEC} velox_benchmark_common GTest::gtest GTest::gtest_main) + target_include_directories( + ${TEST_EXEC} PRIVATE ${CMAKE_SOURCE_DIR}/velox ${CMAKE_SOURCE_DIR}/src + ${VELOX_BUILD_PATH}/_deps/duckdb-src/src/include) + target_link_libraries(${TEST_EXEC} velox_benchmark_common GTest::gtest + GTest::gtest_main) gtest_discover_tests(${TEST_EXEC} DISCOVERY_MODE PRE_TEST) endfunction() set(VELOX_TEST_COMMON_SRCS JsonToProtoConverter.cc FilePathGenerator.cc) add_velox_test(velox_shuffle_writer_test SOURCES VeloxShuffleWriterTest.cc) -# TODO: ORC is not well supported. -# add_velox_test(orc_test SOURCES OrcTest.cc) +# TODO: ORC is not well supported. 
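As the BufferOutputStreamTest change shows, Velox-backed tests now bring up the Gluten backend before initializing the Velox memory manager singleton. A hypothetical fixture following the same SetUpTestCase order; the fixture name and test body are placeholders, the two setup calls are the ones from the diff.

#include <gtest/gtest.h>
#include "compute/VeloxBackend.h"
#include "velox/common/memory/Memory.h"

class MyVeloxTest : public ::testing::Test {
 protected:
  static void SetUpTestCase() {
    // Backend first, then the Velox memory manager singleton, as in BufferOutputStreamTest.
    gluten::VeloxBackend::create({});
    facebook::velox::memory::MemoryManager::testingSetInstance({});
  }
};

TEST_F(MyVeloxTest, placeholder) {
  SUCCEED();
}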
add_velox_test(orc_test SOURCES OrcTest.cc) add_velox_test( - velox_operators_test - SOURCES - VeloxColumnarToRowTest.cc - VeloxRowToColumnarTest.cc - VeloxColumnarBatchSerializerTest.cc - VeloxColumnarBatchTest.cc) + velox_operators_test SOURCES VeloxColumnarToRowTest.cc + VeloxRowToColumnarTest.cc VeloxColumnarBatchSerializerTest.cc + VeloxColumnarBatchTest.cc) add_velox_test( - velox_plan_conversion_test - SOURCES - Substrait2VeloxPlanConversionTest.cc - Substrait2VeloxPlanValidatorTest.cc - Substrait2VeloxValuesNodeConversionTest.cc - SubstraitExtensionCollectorTest.cc - VeloxSubstraitRoundTripTest.cc - VeloxSubstraitSignatureTest.cc - VeloxToSubstraitTypeTest.cc) -add_velox_test(spark_functions_test SOURCES SparkFunctionTest.cc FunctionTest.cc) + velox_plan_conversion_test + SOURCES + Substrait2VeloxPlanConversionTest.cc + Substrait2VeloxPlanValidatorTest.cc + Substrait2VeloxValuesNodeConversionTest.cc + SubstraitExtensionCollectorTest.cc + VeloxSubstraitRoundTripTest.cc + VeloxSubstraitSignatureTest.cc + VeloxToSubstraitTypeTest.cc) +add_velox_test(spark_functions_test SOURCES SparkFunctionTest.cc + FunctionTest.cc) add_velox_test(execution_ctx_test SOURCES RuntimeTest.cc) add_velox_test(velox_memory_test SOURCES MemoryManagerTest.cc) add_velox_test(buffer_outputstream_test SOURCES BufferOutputStreamTest.cc) diff --git a/cpp/velox/tests/MemoryManagerTest.cc b/cpp/velox/tests/MemoryManagerTest.cc index 400beafcc1f80..52f2fa8b661c8 100644 --- a/cpp/velox/tests/MemoryManagerTest.cc +++ b/cpp/velox/tests/MemoryManagerTest.cc @@ -54,7 +54,7 @@ class MemoryManagerTest : public ::testing::Test { } void SetUp() override { - vmm_ = std::make_unique("test", stdAllocator_, std::make_unique()); + vmm_ = std::make_unique(std::make_unique()); listener_ = vmm_->getListener(); allocator_ = vmm_->allocator(); } diff --git a/cpp/velox/tests/RuntimeTest.cc b/cpp/velox/tests/RuntimeTest.cc index 377d760542659..563539d7d63e8 100644 --- a/cpp/velox/tests/RuntimeTest.cc +++ b/cpp/velox/tests/RuntimeTest.cc @@ -18,19 +18,36 @@ #include "compute/VeloxRuntime.h" #include +#include "compute/VeloxBackend.h" namespace gluten { +class DummyMemoryManager final : public MemoryManager { + public: + arrow::MemoryPool* getArrowMemoryPool() override { + throw GlutenException("Not yet implemented"); + } + const MemoryUsageStats collectMemoryUsageStats() const override { + throw GlutenException("Not yet implemented"); + } + const int64_t shrink(int64_t size) override { + throw GlutenException("Not yet implemented"); + } + void hold() override { + throw GlutenException("Not yet implemented"); + } +}; + class DummyRuntime final : public Runtime { public: - DummyRuntime(const std::unordered_map& conf) : Runtime(conf) {} + DummyRuntime(std::unique_ptr listener, const std::unordered_map& conf) + : Runtime(std::make_shared(), conf) {} void parsePlan(const uint8_t* data, int32_t size, std::optional dumpFile) override {} void parseSplitInfo(const uint8_t* data, int32_t size, std::optional dumpFile) override {} std::shared_ptr createResultIterator( - MemoryManager* memoryManager, const std::string& spillDir, const std::vector>& inputs, const std::unordered_map& sessionConf) override { @@ -38,54 +55,41 @@ class DummyRuntime final : public Runtime { auto iter = std::make_shared(std::move(resIter)); return iter; } - MemoryManager* createMemoryManager( - const std::string& name, - std::shared_ptr ptr, - std::unique_ptr uniquePtr) override { + MemoryManager* memoryManager() override { throw GlutenException("Not yet implemented"); } 
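With the memory-manager arguments removed from the Runtime API, a runtime factory now receives the AllocationListener directly, and tests create runtimes via Runtime::create(kind, listener). A sketch of registering such a factory; the include paths are assumptions about the Gluten tree, and the signatures follow the DummyRuntime hunk.

#include <memory>
#include <string>
#include <unordered_map>
#include "compute/Runtime.h"          // assumed location of gluten::Runtime
#include "memory/AllocationListener.h" // assumed location of gluten::AllocationListener

namespace {
gluten::Runtime* myRuntimeFactory(
    std::unique_ptr<gluten::AllocationListener> listener,
    const std::unordered_map<std::string, std::string> conf) {
  // A real factory would construct a Runtime subclass whose MemoryManager wraps
  // the listener, as DummyRuntime does above; nullptr is just a placeholder.
  (void)listener;
  (void)conf;
  return nullptr;
}
} // namespace

void registerMyRuntime() {
  gluten::Runtime::registerFactory("MY_RUNTIME", myRuntimeFactory);
  // Later: auto* runtime = gluten::Runtime::create("MY_RUNTIME", gluten::AllocationListener::noop());
}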
std::shared_ptr createOrGetEmptySchemaBatch(int32_t numRows) override { throw GlutenException("Not yet implemented"); } - std::shared_ptr createColumnar2RowConverter(MemoryManager* memoryManager) override { + std::shared_ptr createColumnar2RowConverter() override { throw GlutenException("Not yet implemented"); } - std::shared_ptr createRow2ColumnarConverter( - MemoryManager* memoryManager, - struct ArrowSchema* cSchema) override { + std::shared_ptr createRow2ColumnarConverter(struct ArrowSchema* cSchema) override { throw GlutenException("Not yet implemented"); } std::shared_ptr createShuffleWriter( int numPartitions, std::unique_ptr partitionWriter, - ShuffleWriterOptions, - MemoryManager* memoryManager) override { + ShuffleWriterOptions) override { throw GlutenException("Not yet implemented"); } Metrics* getMetrics(ColumnarBatchIterator* rawIter, int64_t exportNanos) override { static Metrics m(1); return &m; } - std::shared_ptr createDatasource( - const std::string& filePath, - MemoryManager* memoryManager, - std::shared_ptr schema) override { + std::shared_ptr createDatasource(const std::string& filePath, std::shared_ptr schema) + override { throw GlutenException("Not yet implemented"); } std::shared_ptr createShuffleReader( std::shared_ptr schema, - ShuffleReaderOptions options, - arrow::MemoryPool* pool, - MemoryManager* memoryManager) override { + ShuffleReaderOptions options) override { throw GlutenException("Not yet implemented"); } - std::unique_ptr createColumnarBatchSerializer( - MemoryManager* memoryManager, - arrow::MemoryPool* arrowPool, - struct ArrowSchema* cSchema) override { + std::unique_ptr createColumnarBatchSerializer(struct ArrowSchema* cSchema) override { throw GlutenException("Not yet implemented"); } - std::shared_ptr select(MemoryManager*, std::shared_ptr, std::vector) override { + std::shared_ptr select(std::shared_ptr, std::vector) override { throw GlutenException("Not yet implemented"); } std::string planString(bool details, const std::unordered_map& sessionConf) override { @@ -100,8 +104,6 @@ class DummyRuntime final : public Runtime { } private: - ResourceMap> resultIteratorHolder_; - class DummyResultIterator : public ColumnarBatchIterator { public: std::shared_ptr next() override { @@ -118,20 +120,30 @@ class DummyRuntime final : public Runtime { }; }; -static Runtime* dummyRuntimeFactory(const std::unordered_map conf) { - return new DummyRuntime(conf); +static Runtime* dummyRuntimeFactory( + std::unique_ptr listener, + const std::unordered_map conf) { + return new DummyRuntime(std::move(listener), conf); } TEST(TestRuntime, CreateRuntime) { Runtime::registerFactory("DUMMY", dummyRuntimeFactory); - auto runtime = Runtime::create("DUMMY"); + auto runtime = Runtime::create("DUMMY", AllocationListener::noop()); ASSERT_EQ(typeid(*runtime), typeid(DummyRuntime)); Runtime::release(runtime); } +TEST(TestRuntime, CreateVeloxRuntime) { + VeloxBackend::create({}); + auto runtime = Runtime::create(kVeloxRuntimeKind, AllocationListener::noop()); + ASSERT_EQ(typeid(*runtime), typeid(VeloxRuntime)); + Runtime::release(runtime); +} + TEST(TestRuntime, GetResultIterator) { - auto runtime = std::make_shared(std::unordered_map()); - auto iter = runtime->createResultIterator(nullptr, "/tmp/test-spill", {}, {}); + auto runtime = + std::make_shared(AllocationListener::noop(), std::unordered_map()); + auto iter = runtime->createResultIterator("/tmp/test-spill", {}, {}); ASSERT_TRUE(iter->hasNext()); auto next = iter->next(); ASSERT_NE(next, nullptr); diff --git 
a/cpp/velox/tests/VeloxShuffleWriterTest.cc b/cpp/velox/tests/VeloxShuffleWriterTest.cc index 52649128a9990..1c1be6fc1b6f3 100644 --- a/cpp/velox/tests/VeloxShuffleWriterTest.cc +++ b/cpp/velox/tests/VeloxShuffleWriterTest.cc @@ -375,6 +375,9 @@ TEST_P(RoundRobinPartitioningShuffleWriter, preAllocForceRealloc) { } TEST_P(RoundRobinPartitioningShuffleWriter, preAllocForceReuse) { + if (GetParam().shuffleWriterType == kSortShuffle) { + return; + } ASSERT_NOT_OK(initShuffleWriterOptions()); shuffleWriterOptions_.bufferReallocThreshold = 1; // Force re-alloc on buffer size changed. auto shuffleWriter = createShuffleWriter(defaultArrowMemoryPool().get()); @@ -440,12 +443,7 @@ TEST_P(RoundRobinPartitioningShuffleWriter, spillVerifyResult) { auto blockPid2 = takeRows({inputVector1_}, {{1, 3, 5, 7, 9, 1, 3, 5, 7, 9, 1, 3, 5, 7, 9}}); // Stop and verify. - shuffleWriteReadMultiBlocks( - *shuffleWriter, - 2, - inputVector1_->type(), - // {{block1Pid1, block1Pid1, block1Pid1}, {block1Pid2, block1Pid2, block1Pid2}}); - {{blockPid1}, {blockPid2}}); + shuffleWriteReadMultiBlocks(*shuffleWriter, 2, inputVector1_->type(), {{blockPid1}, {blockPid2}}); } TEST_F(VeloxShuffleWriterMemoryTest, memoryLeak) { diff --git a/cpp/velox/udf/Udaf.h b/cpp/velox/udf/Udaf.h index 5b33e0611ba22..2f292fbc6cb34 100644 --- a/cpp/velox/udf/Udaf.h +++ b/cpp/velox/udf/Udaf.h @@ -23,7 +23,7 @@ struct UdafEntry { const char* name; const char* dataType; - size_t numArgs; + int numArgs; const char** argTypes; const char* intermediateType{nullptr}; diff --git a/cpp/velox/udf/Udf.h b/cpp/velox/udf/Udf.h index 1fa3c54d5213e..a32bdaefe9ec4 100644 --- a/cpp/velox/udf/Udf.h +++ b/cpp/velox/udf/Udf.h @@ -23,7 +23,7 @@ struct UdfEntry { const char* name; const char* dataType; - size_t numArgs; + int numArgs; const char** argTypes; bool variableArity{false}; diff --git a/cpp/velox/udf/examples/MyUDAF.cc b/cpp/velox/udf/examples/MyUDAF.cc index e6c4b1fea7e00..710bce53ae65c 100644 --- a/cpp/velox/udf/examples/MyUDAF.cc +++ b/cpp/velox/udf/examples/MyUDAF.cc @@ -20,19 +20,22 @@ #include #include #include -#include + #include "udf/Udaf.h" +#include "udf/examples/UdfCommon.h" using namespace facebook::velox; using namespace facebook::velox::exec; namespace { +static const char* kBoolean = "boolean"; static const char* kInteger = "int"; static const char* kBigInt = "bigint"; static const char* kFloat = "float"; static const char* kDouble = "double"; +namespace myavg { // Copied from velox/exec/tests/SimpleAverageAggregate.cpp // Implementation of the average aggregation function through the @@ -98,84 +101,321 @@ class AverageAggregate { }; }; -exec::AggregateRegistrationResult registerSimpleAverageAggregate(const std::string& name) { - std::vector> signatures; +class MyAvgRegisterer final : public gluten::UdafRegisterer { + int getNumUdaf() override { + return 4; + } + + void populateUdafEntries(int& index, gluten::UdafEntry* udafEntries) override { + for (const auto& argTypes : {myAvgArg1_, myAvgArg2_, myAvgArg3_, myAvgArg4_}) { + udafEntries[index++] = {name_.c_str(), kDouble, 1, argTypes, myAvgIntermediateType_}; + } + } + + void registerSignatures() override { + registerSimpleAverageAggregate(); + } + + private: + exec::AggregateRegistrationResult registerSimpleAverageAggregate() { + std::vector> signatures; + + for (const auto& inputType : {"smallint", "integer", "bigint", "double"}) { + signatures.push_back(exec::AggregateFunctionSignatureBuilder() + .returnType("double") + .intermediateType("row(double,bigint)") + .argumentType(inputType) 
+ .build()); + } - for (const auto& inputType : {"smallint", "integer", "bigint", "double"}) { signatures.push_back(exec::AggregateFunctionSignatureBuilder() - .returnType("double") + .returnType("real") .intermediateType("row(double,bigint)") - .argumentType(inputType) + .argumentType("real") .build()); - } - signatures.push_back(exec::AggregateFunctionSignatureBuilder() - .returnType("real") - .intermediateType("row(double,bigint)") - .argumentType("real") - .build()); - - return exec::registerAggregateFunction( - name, - std::move(signatures), - [name]( - core::AggregationNode::Step step, - const std::vector& argTypes, - const TypePtr& resultType, - const core::QueryConfig& /*config*/) -> std::unique_ptr { - VELOX_CHECK_LE(argTypes.size(), 1, "{} takes at most one argument", name); - auto inputType = argTypes[0]; - if (exec::isRawInput(step)) { - switch (inputType->kind()) { - case TypeKind::SMALLINT: - return std::make_unique>>(resultType); - case TypeKind::INTEGER: - return std::make_unique>>(resultType); - case TypeKind::BIGINT: - return std::make_unique>>(resultType); - case TypeKind::REAL: - return std::make_unique>>(resultType); - case TypeKind::DOUBLE: - return std::make_unique>>(resultType); - default: - VELOX_FAIL("Unknown input type for {} aggregation {}", name, inputType->kindName()); - } - } else { - switch (resultType->kind()) { - case TypeKind::REAL: - return std::make_unique>>(resultType); - case TypeKind::DOUBLE: - case TypeKind::ROW: - return std::make_unique>>(resultType); - default: - VELOX_FAIL("Unsupported result type for final aggregation: {}", resultType->kindName()); + return exec::registerAggregateFunction( + name_, + std::move(signatures), + [this]( + core::AggregationNode::Step step, + const std::vector& argTypes, + const TypePtr& resultType, + const core::QueryConfig& /*config*/) -> std::unique_ptr { + VELOX_CHECK_LE(argTypes.size(), 1, "{} takes at most one argument", name_); + auto inputType = argTypes[0]; + if (exec::isRawInput(step)) { + switch (inputType->kind()) { + case TypeKind::SMALLINT: + return std::make_unique>>(resultType); + case TypeKind::INTEGER: + return std::make_unique>>(resultType); + case TypeKind::BIGINT: + return std::make_unique>>(resultType); + case TypeKind::REAL: + return std::make_unique>>(resultType); + case TypeKind::DOUBLE: + return std::make_unique>>(resultType); + default: + VELOX_FAIL("Unknown input type for {} aggregation {}", name_, inputType->kindName()); + } + } else { + switch (resultType->kind()) { + case TypeKind::REAL: + return std::make_unique>>(resultType); + case TypeKind::DOUBLE: + case TypeKind::ROW: + return std::make_unique>>(resultType); + default: + VELOX_FAIL("Unsupported result type for final aggregation: {}", resultType->kindName()); + } } + }, + true /*registerCompanionFunctions*/, + true /*overwrite*/); + } + + const std::string name_ = "myavg"; + const char* myAvgArg1_[1] = {kInteger}; + const char* myAvgArg2_[1] = {kBigInt}; + const char* myAvgArg3_[1] = {kFloat}; + const char* myAvgArg4_[1] = {kDouble}; + + const char* myAvgIntermediateType_ = "struct"; +}; +} // namespace myavg + +namespace mycountif { + +// Copied from velox/functions/prestosql/aggregates/CountIfAggregate.cpp +class CountIfAggregate : public exec::Aggregate { + public: + explicit CountIfAggregate() : exec::Aggregate(BIGINT()) {} + + int32_t accumulatorFixedWidthSize() const override { + return sizeof(int64_t); + } + + void extractAccumulators(char** groups, int32_t numGroups, VectorPtr* result) override { + extractValues(groups, 
numGroups, result); + } + + void extractValues(char** groups, int32_t numGroups, VectorPtr* result) override { + auto* vector = (*result)->as>(); + VELOX_CHECK(vector); + vector->resize(numGroups); + + auto* rawValues = vector->mutableRawValues(); + for (vector_size_t i = 0; i < numGroups; ++i) { + rawValues[i] = *value(groups[i]); + } + } + + void addRawInput( + char** groups, + const SelectivityVector& rows, + const std::vector& args, + bool /*mayPushdown*/) override { + DecodedVector decoded(*args[0], rows); + + if (decoded.isConstantMapping()) { + if (decoded.isNullAt(0)) { + return; + } + if (decoded.valueAt(0)) { + rows.applyToSelected([&](vector_size_t i) { addToGroup(groups[i], 1); }); + } + } else if (decoded.mayHaveNulls()) { + rows.applyToSelected([&](vector_size_t i) { + if (decoded.isNullAt(i)) { + return; + } + if (decoded.valueAt(i)) { + addToGroup(groups[i], 1); } - }, - true /*registerCompanionFunctions*/, - true /*overwrite*/); + }); + } else { + rows.applyToSelected([&](vector_size_t i) { + if (decoded.valueAt(i)) { + addToGroup(groups[i], 1); + } + }); + } + } + + void addIntermediateResults( + char** groups, + const SelectivityVector& rows, + const std::vector& args, + bool /*mayPushdown*/) override { + DecodedVector decoded(*args[0], rows); + + if (decoded.isConstantMapping()) { + auto numTrue = decoded.valueAt(0); + rows.applyToSelected([&](vector_size_t i) { addToGroup(groups[i], numTrue); }); + return; + } + + rows.applyToSelected([&](vector_size_t i) { + auto numTrue = decoded.valueAt(i); + addToGroup(groups[i], numTrue); + }); + } + + void addSingleGroupRawInput( + char* group, + const SelectivityVector& rows, + const std::vector& args, + bool /*mayPushdown*/) override { + DecodedVector decoded(*args[0], rows); + + // Constant mapping - check once and add number of selected rows if true. 
+ if (decoded.isConstantMapping()) { + if (!decoded.isNullAt(0)) { + auto isTrue = decoded.valueAt(0); + if (isTrue) { + addToGroup(group, rows.countSelected()); + } + } + return; + } + + int64_t numTrue = 0; + if (decoded.mayHaveNulls()) { + rows.applyToSelected([&](vector_size_t i) { + if (decoded.isNullAt(i)) { + return; + } + if (decoded.valueAt(i)) { + ++numTrue; + } + }); + } else { + rows.applyToSelected([&](vector_size_t i) { + if (decoded.valueAt(i)) { + ++numTrue; + } + }); + } + addToGroup(group, numTrue); + } + + void addSingleGroupIntermediateResults( + char* group, + const SelectivityVector& rows, + const std::vector& args, + bool /*mayPushdown*/) override { + auto arg = args[0]->as>(); + + int64_t numTrue = 0; + rows.applyToSelected([&](auto row) { numTrue += arg->valueAt(row); }); + + addToGroup(group, numTrue); + } + + protected: + void initializeNewGroupsInternal(char** groups, folly::Range indices) override { + for (auto i : indices) { + *value(groups[i]) = 0; + } + } + + private: + inline void addToGroup(char* group, int64_t numTrue) { + *value(group) += numTrue; + } +}; + +class MyCountIfRegisterer final : public gluten::UdafRegisterer { + int getNumUdaf() override { + return 1; + } + + void populateUdafEntries(int& index, gluten::UdafEntry* udafEntries) override { + udafEntries[index++] = {name_.c_str(), kBigInt, 1, myCountIfArg_, kBigInt}; + } + + void registerSignatures() override { + registerCountIfAggregate(); + } + + private: + void registerCountIfAggregate() { + std::vector> signatures{ + exec::AggregateFunctionSignatureBuilder() + .returnType("bigint") + .intermediateType("bigint") + .argumentType("boolean") + .build(), + }; + + exec::registerAggregateFunction( + name_, + std::move(signatures), + [this]( + core::AggregationNode::Step step, + std::vector argTypes, + const TypePtr& /*resultType*/, + const core::QueryConfig& /*config*/) -> std::unique_ptr { + VELOX_CHECK_EQ(argTypes.size(), 1, "{} takes one argument", name_); + + auto isPartial = exec::isRawInput(step); + if (isPartial) { + VELOX_CHECK_EQ(argTypes[0]->kind(), TypeKind::BOOLEAN, "{} function only accepts boolean parameter", name_); + } + + return std::make_unique(); + }, + {false /*orderSensitive*/}, + true, + true); + } + + const std::string name_ = "mycount_if"; + const char* myCountIfArg_[1] = {kBoolean}; +}; +} // namespace mycountif + +std::vector>& globalRegisters() { + static std::vector> registerers; + return registerers; } -} // namespace -const int kNumMyUdaf = 4; +void setupRegisterers() { + static bool inited = false; + if (inited) { + return; + } + auto& registerers = globalRegisters(); + registerers.push_back(std::make_shared()); + registerers.push_back(std::make_shared()); + inited = true; +} +} // namespace DEFINE_GET_NUM_UDAF { - return kNumMyUdaf; + setupRegisterers(); + + int numUdf = 0; + for (const auto& registerer : globalRegisters()) { + numUdf += registerer->getNumUdaf(); + } + return numUdf; } -const char* myAvgArg1[] = {kInteger}; -const char* myAvgArg2[] = {kBigInt}; -const char* myAvgArg3[] = {kFloat}; -const char* myAvgArg4[] = {kDouble}; -const char* myAvgIntermediateType = "struct"; DEFINE_GET_UDAF_ENTRIES { + setupRegisterers(); + int index = 0; - udafEntries[index++] = {"myavg", kDouble, 1, myAvgArg1, myAvgIntermediateType}; - udafEntries[index++] = {"myavg", kDouble, 1, myAvgArg2, myAvgIntermediateType}; - udafEntries[index++] = {"myavg", kDouble, 1, myAvgArg3, myAvgIntermediateType}; - udafEntries[index++] = {"myavg", kDouble, 1, myAvgArg4, myAvgIntermediateType}; 
+ for (const auto& registerer : globalRegisters()) { + registerer->populateUdafEntries(index, udafEntries); + } } DEFINE_REGISTER_UDAF { - registerSimpleAverageAggregate("myavg"); + setupRegisterers(); + + for (const auto& registerer : globalRegisters()) { + registerer->registerSignatures(); + } } diff --git a/cpp/velox/udf/examples/MyUDF.cc b/cpp/velox/udf/examples/MyUDF.cc index 88bc3ad85da38..ee20ca39d0264 100644 --- a/cpp/velox/udf/examples/MyUDF.cc +++ b/cpp/velox/udf/examples/MyUDF.cc @@ -20,28 +20,17 @@ #include #include #include "udf/Udf.h" +#include "udf/examples/UdfCommon.h" using namespace facebook::velox; using namespace facebook::velox::exec; +namespace { + static const char* kInteger = "int"; static const char* kBigInt = "bigint"; static const char* kDate = "date"; -class UdfRegisterer { - public: - ~UdfRegisterer() = default; - - // Returns the number of UDFs in populateUdfEntries. - virtual int getNumUdf() = 0; - - // Populate the udfEntries, starting at the given index. - virtual void populateUdfEntries(int& index, gluten::UdfEntry* udfEntries) = 0; - - // Register all function signatures to velox. - virtual void registerSignatures() = 0; -}; - namespace myudf { template @@ -106,7 +95,7 @@ static std::shared_ptr makePlusConstant( // signatures: // bigint -> bigint // type: VectorFunction -class MyUdf1Registerer final : public UdfRegisterer { +class MyUdf1Registerer final : public gluten::UdfRegisterer { public: int getNumUdf() override { return 1; @@ -135,7 +124,7 @@ class MyUdf1Registerer final : public UdfRegisterer { // integer -> integer // bigint -> bigint // type: StatefulVectorFunction -class MyUdf2Registerer final : public UdfRegisterer { +class MyUdf2Registerer final : public gluten::UdfRegisterer { public: int getNumUdf() override { return 2; @@ -167,7 +156,7 @@ class MyUdf2Registerer final : public UdfRegisterer { // [integer,] ... -> integer // bigint, [bigint,] ... -> bigint // type: StatefulVectorFunction with variable arity -class MyUdf3Registerer final : public UdfRegisterer { +class MyUdf3Registerer final : public gluten::UdfRegisterer { public: int getNumUdf() override { return 2; @@ -215,7 +204,7 @@ struct MyDateSimpleFunction { // signatures: // date, integer -> bigint // type: SimpleFunction -class MyDateRegisterer final : public UdfRegisterer { +class MyDateRegisterer final : public gluten::UdfRegisterer { public: int getNumUdf() override { return 1; @@ -235,8 +224,8 @@ class MyDateRegisterer final : public UdfRegisterer { }; } // namespace mydate -std::vector>& globalRegisters() { - static std::vector> registerers; +std::vector>& globalRegisters() { + static std::vector> registerers; return registerers; } @@ -252,6 +241,7 @@ void setupRegisterers() { registerers.push_back(std::make_shared()); inited = true; } +} // namespace DEFINE_GET_NUM_UDF { setupRegisterers(); diff --git a/cpp/velox/udf/examples/UdfCommon.h b/cpp/velox/udf/examples/UdfCommon.h new file mode 100644 index 0000000000000..a68c474607cd1 --- /dev/null +++ b/cpp/velox/udf/examples/UdfCommon.h @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "udf/Udaf.h" +#include "udf/Udf.h" + +namespace gluten { + +class UdfRegisterer { + public: + ~UdfRegisterer() = default; + + // Returns the number of UDFs in populateUdfEntries. + virtual int getNumUdf() = 0; + + // Populate the udfEntries, starting at the given index. + virtual void populateUdfEntries(int& index, gluten::UdfEntry* udfEntries) = 0; + + // Register all function signatures to velox. + virtual void registerSignatures() = 0; +}; + +class UdafRegisterer { + public: + ~UdafRegisterer() = default; + + // Returns the number of UDFs in populateUdafEntries. + virtual int getNumUdaf() = 0; + + // Populate the udfEntries, starting at the given index. + virtual void populateUdafEntries(int& index, gluten::UdafEntry* udafEntries) = 0; + + // Register all function signatures to velox. + virtual void registerSignatures() = 0; +}; + +} // namespace gluten \ No newline at end of file diff --git a/cpp/velox/utils/ConfigExtractor.cc b/cpp/velox/utils/ConfigExtractor.cc index a71f143225b97..816166351c0e8 100644 --- a/cpp/velox/utils/ConfigExtractor.cc +++ b/cpp/velox/utils/ConfigExtractor.cc @@ -34,6 +34,13 @@ const bool kVeloxFileHandleCacheEnabledDefault = false; // Log granularity of AWS C++ SDK const std::string kVeloxAwsSdkLogLevel = "spark.gluten.velox.awsSdkLogLevel"; const std::string kVeloxAwsSdkLogLevelDefault = "FATAL"; +// Retry mode for AWS s3 +const std::string kVeloxS3RetryMode = "spark.gluten.velox.fs.s3a.retry.mode"; +const std::string kVeloxS3RetryModeDefault = "legacy"; +// Connection timeout for AWS s3 +const std::string kVeloxS3ConnectTimeout = "spark.gluten.velox.fs.s3a.connect.timeout"; +// Using default fs.s3a.connection.timeout value in hadoop +const std::string kVeloxS3ConnectTimeoutDefault = "200s"; } // namespace namespace gluten { @@ -64,6 +71,10 @@ std::shared_ptr getHiveConfig(std::shared_ptr< bool useInstanceCredentials = conf->get("spark.hadoop.fs.s3a.use.instance.credentials", false); std::string iamRole = conf->get("spark.hadoop.fs.s3a.iam.role", ""); std::string iamRoleSessionName = conf->get("spark.hadoop.fs.s3a.iam.role.session.name", ""); + std::string retryMaxAttempts = conf->get("spark.hadoop.fs.s3a.retry.limit", "20"); + std::string retryMode = conf->get(kVeloxS3RetryMode, kVeloxS3RetryModeDefault); + std::string maxConnections = conf->get("spark.hadoop.fs.s3a.connection.maximum", "15"); + std::string connectTimeout = conf->get(kVeloxS3ConnectTimeout, kVeloxS3ConnectTimeoutDefault); std::string awsSdkLogLevel = conf->get(kVeloxAwsSdkLogLevel, kVeloxAwsSdkLogLevelDefault); @@ -79,6 +90,14 @@ std::shared_ptr getHiveConfig(std::shared_ptr< if (envAwsEndpoint != nullptr) { awsEndpoint = std::string(envAwsEndpoint); } + const char* envRetryMaxAttempts = std::getenv("AWS_MAX_ATTEMPTS"); + if (envRetryMaxAttempts != nullptr) { + retryMaxAttempts = std::string(envRetryMaxAttempts); + } + const char* envRetryMode = std::getenv("AWS_RETRY_MODE"); + if (envRetryMode != nullptr) { + retryMode = std::string(envRetryMode); + } if (useInstanceCredentials) { 
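The ConfigExtractor change above resolves the new S3 retry settings by first reading the Spark/Hadoop configuration (falling back to defaults such as `20` attempts or the `legacy` retry mode) and then letting the `AWS_MAX_ATTEMPTS` / `AWS_RETRY_MODE` environment variables override the result. A small self-contained sketch of that precedence, using a plain map in place of the real config object; `getWithDefault` and `resolve` are hypothetical helpers, not names from the patch.

```cpp
#include <cstdlib>
#include <iostream>
#include <string>
#include <unordered_map>

// Hypothetical stand-in for the configuration map handed to getHiveConfig().
using ConfMap = std::unordered_map<std::string, std::string>;

std::string getWithDefault(const ConfMap& conf, const std::string& key, const std::string& dflt) {
  auto it = conf.find(key);
  return it == conf.end() ? dflt : it->second;
}

// Same precedence as the patch: Spark conf (or its default) first,
// then the AWS environment variable wins if it is set.
std::string resolve(const ConfMap& conf, const std::string& sparkKey,
                    const std::string& dflt, const char* envName) {
  std::string value = getWithDefault(conf, sparkKey, dflt);
  if (const char* env = std::getenv(envName)) {
    value = env;
  }
  return value;
}

int main() {
  ConfMap conf = {{"spark.hadoop.fs.s3a.retry.limit", "10"}};
  std::cout << "maxAttempts="
            << resolve(conf, "spark.hadoop.fs.s3a.retry.limit", "20", "AWS_MAX_ATTEMPTS") << "\n";
  std::cout << "retryMode="
            << resolve(conf, "spark.gluten.velox.fs.s3a.retry.mode", "legacy", "AWS_RETRY_MODE") << "\n";
}
```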
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kS3UseInstanceCredentials] = "true"; @@ -98,6 +117,10 @@ std::shared_ptr getHiveConfig(std::shared_ptr< hiveConfMap[facebook::velox::connector::hive::HiveConfig::kS3SSLEnabled] = sslEnabled ? "true" : "false"; hiveConfMap[facebook::velox::connector::hive::HiveConfig::kS3PathStyleAccess] = pathStyleAccess ? "true" : "false"; hiveConfMap[facebook::velox::connector::hive::HiveConfig::kS3LogLevel] = awsSdkLogLevel; + hiveConfMap[facebook::velox::connector::hive::HiveConfig::kS3MaxAttempts] = retryMaxAttempts; + hiveConfMap[facebook::velox::connector::hive::HiveConfig::kS3RetryMode] = retryMode; + hiveConfMap[facebook::velox::connector::hive::HiveConfig::kS3MaxConnections] = maxConnections; + hiveConfMap[facebook::velox::connector::hive::HiveConfig::kS3ConnectTimeout] = connectTimeout; #endif #ifdef ENABLE_GCS diff --git a/cpp/velox/utils/VeloxBatchAppender.cc b/cpp/velox/utils/VeloxBatchAppender.cc new file mode 100644 index 0000000000000..8fa1ade217e02 --- /dev/null +++ b/cpp/velox/utils/VeloxBatchAppender.cc @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "VeloxBatchAppender.h" + +namespace gluten { + +gluten::VeloxBatchAppender::VeloxBatchAppender( + facebook::velox::memory::MemoryPool* pool, + int32_t minOutputBatchSize, + std::unique_ptr in) + : pool_(pool), minOutputBatchSize_(minOutputBatchSize), in_(std::move(in)) {} + +std::shared_ptr VeloxBatchAppender::next() { + auto cb = in_->next(); + if (cb == nullptr) { + // Input iterator was drained. + return nullptr; + } + if (cb->numRows() >= minOutputBatchSize_) { + // Fast flush path. + return cb; + } + + auto vb = VeloxColumnarBatch::from(pool_, cb); + auto rv = vb->getRowVector(); + auto buffer = facebook::velox::RowVector::createEmpty(rv->type(), pool_); + buffer->append(rv.get()); + + for (auto nextCb = in_->next(); nextCb != nullptr; nextCb = in_->next()) { + auto nextVb = VeloxColumnarBatch::from(pool_, nextCb); + auto nextRv = nextVb->getRowVector(); + buffer->append(nextRv.get()); + if (buffer->size() >= minOutputBatchSize_) { + // Buffer is full. + break; + } + } + return std::make_shared(buffer); +} + +int64_t VeloxBatchAppender::spillFixedSize(int64_t size) { + return in_->spillFixedSize(size); +} +} // namespace gluten diff --git a/cpp/velox/utils/VeloxBatchAppender.h b/cpp/velox/utils/VeloxBatchAppender.h new file mode 100644 index 0000000000000..3698381d0add5 --- /dev/null +++ b/cpp/velox/utils/VeloxBatchAppender.h @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
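`VeloxBatchAppender::next()` above passes sufficiently large batches straight through and otherwise keeps pulling from the upstream iterator, appending rows into a buffer until at least `minOutputBatchSize` rows are collected or the input is drained. The sketch below mirrors only that control flow, with integer row counts standing in for Velox row vectors; the `BatchSource` type is invented purely for illustration.

```cpp
#include <deque>
#include <iostream>
#include <optional>
#include <vector>

// Each element stands in for one upstream batch; only its row count matters here.
class BatchSource {
 public:
  explicit BatchSource(std::vector<int> sizes) : sizes_(sizes.begin(), sizes.end()) {}
  std::optional<int> next() {
    if (sizes_.empty()) return std::nullopt;  // upstream drained
    int n = sizes_.front();
    sizes_.pop_front();
    return n;
  }

 private:
  std::deque<int> sizes_;
};

// Mirrors the appender's next(): large batches take the fast flush path,
// small ones are accumulated until the buffer reaches minOutputBatchSize.
std::optional<int> nextCoalesced(BatchSource& in, int minOutputBatchSize) {
  auto first = in.next();
  if (!first) return std::nullopt;
  if (*first >= minOutputBatchSize) return first;  // fast flush path
  int buffered = *first;
  while (buffered < minOutputBatchSize) {
    auto more = in.next();
    if (!more) break;  // input drained; emit whatever was buffered
    buffered += *more;
  }
  return buffered;
}

int main() {
  BatchSource in({100, 50, 4096, 30, 20});
  while (auto out = nextCoalesced(in, 1024)) {
    std::cout << "emit batch of " << *out << " rows\n";
  }
}
```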
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "memory/ColumnarBatchIterator.h" +#include "memory/VeloxColumnarBatch.h" +#include "utils/exception.h" +#include "velox/common/memory/MemoryPool.h" +#include "velox/vector/ComplexVector.h" + +namespace gluten { +class VeloxBatchAppender : public ColumnarBatchIterator { + public: + VeloxBatchAppender( + facebook::velox::memory::MemoryPool* pool, + int32_t minOutputBatchSize, + std::unique_ptr in); + + std::shared_ptr next() override; + + int64_t spillFixedSize(int64_t size) override; + + private: + facebook::velox::memory::MemoryPool* pool_; + const int32_t minOutputBatchSize_; + std::unique_ptr in_; +}; +} // namespace gluten diff --git a/cpp/velox/utils/tests/VeloxShuffleWriterTestBase.h b/cpp/velox/utils/tests/VeloxShuffleWriterTestBase.h index 94e2b071b430b..fd3ae3d547fee 100644 --- a/cpp/velox/utils/tests/VeloxShuffleWriterTestBase.h +++ b/cpp/velox/utils/tests/VeloxShuffleWriterTestBase.h @@ -26,6 +26,7 @@ #include "shuffle/PartitionWriter.h" #include "shuffle/VeloxShuffleReader.h" #include "utils/Compression.h" +#include "velox/type/Type.h" #include "velox/vector/tests/VectorTestUtils.h" namespace gluten { @@ -119,7 +120,7 @@ class VeloxShuffleWriterTestBase : public facebook::velox::test::VectorTestBase {"alice0", "bob1", "alice2", "bob3", "Alice4", "Bob5", "AlicE6", "boB7", "ALICE8", "BOB9"}), makeNullableFlatVector( {"alice", "bob", std::nullopt, std::nullopt, "Alice", "Bob", std::nullopt, "alicE", std::nullopt, "boB"}), - }; + facebook::velox::BaseVector::create(facebook::velox::UNKNOWN(), 10, pool())}; children2_ = { makeNullableFlatVector({std::nullopt, std::nullopt}), @@ -132,7 +133,7 @@ class VeloxShuffleWriterTestBase : public facebook::velox::test::VectorTestBase {"bob", "alicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealicealice"}), makeNullableFlatVector({std::nullopt, std::nullopt}), - }; + facebook::velox::BaseVector::create(facebook::velox::UNKNOWN(), 2, pool())}; childrenNoNull_ = { makeFlatVector({0, 1}), diff --git a/dev/build_arrow.sh b/dev/build_arrow.sh new file mode 100755 index 0000000000000..897dfcd267a13 --- /dev/null +++ b/dev/build_arrow.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) +source ${CURRENT_DIR}/build_helper_functions.sh +VELOX_ARROW_BUILD_VERSION=15.0.0 +ARROW_PREFIX=$CURRENT_DIR/../ep/_ep/arrow_ep +BUILD_TYPE=Release + +function prepare_arrow_build() { + mkdir -p ${ARROW_PREFIX}/../ && cd ${ARROW_PREFIX}/../ && sudo rm -rf arrow_ep/ + wget_and_untar https://archive.apache.org/dist/arrow/arrow-${VELOX_ARROW_BUILD_VERSION}/apache-arrow-${VELOX_ARROW_BUILD_VERSION}.tar.gz arrow_ep + cd arrow_ep + patch -p1 < $CURRENT_DIR/../ep/build-velox/src/modify_arrow.patch + patch -p1 < $CURRENT_DIR/../ep/build-velox/src/modify_arrow_dataset_scan_option.patch +} + +function install_arrow_deps { + wget_and_untar https://github.com/openssl/openssl/archive/refs/tags/OpenSSL_1_1_1s.tar.gz openssl + pushd openssl + ./config no-shared && make depend && make && sudo make install + popd +} + +function build_arrow_cpp() { + pushd $ARROW_PREFIX/cpp + + cmake_install \ + -DARROW_PARQUET=ON \ + -DARROW_FILESYSTEM=ON \ + -DARROW_PROTOBUF_USE_SHARED=OFF \ + -DARROW_DEPENDENCY_USE_SHARED=OFF \ + -DARROW_DEPENDENCY_SOURCE=BUNDLED \ + -DARROW_WITH_THRIFT=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_SNAPPY=ON \ + -DARROW_WITH_ZLIB=ON \ + -DARROW_WITH_ZSTD=ON \ + -DARROW_JEMALLOC=OFF \ + -DARROW_SIMD_LEVEL=NONE \ + -DARROW_RUNTIME_SIMD_LEVEL=NONE \ + -DARROW_WITH_UTF8PROC=OFF \ + -DARROW_TESTING=ON \ + -DCMAKE_INSTALL_PREFIX=/usr/local \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DARROW_BUILD_SHARED=OFF \ + -DARROW_BUILD_STATIC=ON + popd +} + +function build_arrow_java() { + ARROW_INSTALL_DIR="${ARROW_PREFIX}/install" + + pushd $ARROW_PREFIX/java + # Because arrow-bom module need the -DprocessAllModules + mvn versions:set -DnewVersion=15.0.0-gluten -DprocessAllModules + + mvn clean install -pl bom,maven/module-info-compiler-maven-plugin,vector -am \ + -DskipTests -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -Dassembly.skipAssembly + + # Arrow C Data Interface CPP libraries + mvn generate-resources -P generate-libs-cdata-all-os -Darrow.c.jni.dist.dir=$ARROW_INSTALL_DIR \ + -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -N + + # Arrow JNI Date Interface CPP libraries + export PKG_CONFIG_PATH=/usr/local/lib64/pkgconfig${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}} + mvn generate-resources -Pgenerate-libs-jni-macos-linux -N -Darrow.dataset.jni.dist.dir=$ARROW_INSTALL_DIR \ + -DARROW_GANDIVA=OFF -DARROW_JAVA_JNI_ENABLE_GANDIVA=OFF -DARROW_ORC=OFF -DARROW_JAVA_JNI_ENABLE_ORC=OFF \ + -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -N + + # Arrow Java libraries + mvn install -Parrow-jni -P arrow-c-data -pl c,dataset -am \ + -Darrow.c.jni.dist.dir=$ARROW_INSTALL_DIR/lib -Darrow.dataset.jni.dist.dir=$ARROW_INSTALL_DIR/lib -Darrow.cpp.build.dir=$ARROW_INSTALL_DIR/lib \ + -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -Dassembly.skipAssembly + popd +} diff --git a/dev/build_helper_functions.sh b/dev/build_helper_functions.sh new file mode 100644 index 0000000000000..221f01bf6fb5b --- /dev/null +++ b/dev/build_helper_functions.sh @@ -0,0 +1,118 @@ +#!/bin/bash +# Licensed 
to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +function get_cxx_flags { + local CPU_ARCH=$1 + + local OS + OS=$(uname) + local MACHINE + MACHINE=$(uname -m) + ADDITIONAL_FLAGS="" + + if [[ -z "$CPU_ARCH" ]] || [[ $CPU_ARCH == "unknown" ]]; then + if [ "$OS" = "Darwin" ]; then + + if [ "$MACHINE" = "x86_64" ]; then + local CPU_CAPABILITIES + CPU_CAPABILITIES=$(sysctl -a | grep machdep.cpu.features | awk '{print tolower($0)}') + + if [[ $CPU_CAPABILITIES =~ "avx" ]]; then + CPU_ARCH="avx" + else + CPU_ARCH="sse" + fi + + elif [[ $(sysctl -a | grep machdep.cpu.brand_string) =~ "Apple" ]]; then + # Apple silicon. + CPU_ARCH="arm64" + fi + + # On MacOs prevent the flood of translation visibility settings warnings. + ADDITIONAL_FLAGS="-fvisibility=hidden -fvisibility-inlines-hidden" + else [ "$OS" = "Linux" ]; + + local CPU_CAPABILITIES + CPU_CAPABILITIES=$(cat /proc/cpuinfo | grep flags | head -n 1| awk '{print tolower($0)}') + + if [[ "$CPU_CAPABILITIES" =~ "avx" ]]; then + CPU_ARCH="avx" + elif [[ "$CPU_CAPABILITIES" =~ "sse" ]]; then + CPU_ARCH="sse" + elif [ "$MACHINE" = "aarch64" ]; then + CPU_ARCH="aarch64" + fi + fi + fi + + case $CPU_ARCH in + + "arm64") + echo -n "-mcpu=apple-m1+crc -std=c++17 -fvisibility=hidden $ADDITIONAL_FLAGS" + ;; + + "avx") + echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -std=c++17 -mbmi2 $ADDITIONAL_FLAGS" + ;; + + "sse") + echo -n "-msse4.2 -std=c++17 $ADDITIONAL_FLAGS" + ;; + + "aarch64") + echo -n "-mcpu=neoverse-n1 -std=c++17 $ADDITIONAL_FLAGS" + ;; + *) + echo -n "Architecture not supported!" 
+ esac + +} + +function wget_and_untar { + local URL=$1 + local DIR=$2 + mkdir -p "${DIR}" + pushd "${DIR}" + curl -L "${URL}" > $2.tar.gz + tar -xz --strip-components=1 -f $2.tar.gz + popd +} + +function cmake_install { + local NAME=$(basename "$(pwd)") + local BINARY_DIR=_build + SUDO="${SUDO:-""}" + if [ -d "${BINARY_DIR}" ] && prompt "Do you want to rebuild ${NAME}?"; then + ${SUDO} rm -rf "${BINARY_DIR}" + fi + mkdir -p "${BINARY_DIR}" + CPU_TARGET="${CPU_TARGET:-unknown}" + COMPILER_FLAGS=$(get_cxx_flags $CPU_TARGET) + + # CMAKE_POSITION_INDEPENDENT_CODE is required so that Velox can be built into dynamic libraries \ + cmake -Wno-dev -B"${BINARY_DIR}" \ + -GNinja \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DCMAKE_CXX_STANDARD=17 \ + "${INSTALL_PREFIX+-DCMAKE_PREFIX_PATH=}${INSTALL_PREFIX-}" \ + "${INSTALL_PREFIX+-DCMAKE_INSTALL_PREFIX=}${INSTALL_PREFIX-}" \ + -DCMAKE_CXX_FLAGS="$COMPILER_FLAGS" \ + -DBUILD_TESTING=OFF \ + "$@" + + cmake --build "${BINARY_DIR}" + ${SUDO} cmake --install "${BINARY_DIR}" +} diff --git a/dev/builddeps-veloxbe.sh b/dev/builddeps-veloxbe.sh index 35976d37a0361..6668d0871ad1b 100755 --- a/dev/builddeps-veloxbe.sh +++ b/dev/builddeps-veloxbe.sh @@ -13,7 +13,7 @@ BUILD_TESTS=OFF BUILD_EXAMPLES=OFF BUILD_BENCHMARKS=OFF BUILD_JEMALLOC=OFF -BUILD_PROTOBUF=ON +BUILD_PROTOBUF=OFF BUILD_VELOX_TESTS=OFF BUILD_VELOX_BENCHMARKS=OFF ENABLE_QAT=OFF @@ -24,14 +24,13 @@ ENABLE_S3=OFF ENABLE_HDFS=OFF ENABLE_ABFS=OFF ENABLE_EP_CACHE=OFF -ARROW_ENABLE_CUSTOM_CODEC=OFF ENABLE_VCPKG=OFF RUN_SETUP_SCRIPT=ON VELOX_REPO="" VELOX_BRANCH="" VELOX_HOME="" VELOX_PARAMETER="" -COMPILE_ARROW_JAVA=ON +BUILD_ARROW=ON SPARK_VERSION=ALL # set default number of threads as cpu cores minus 2 @@ -72,12 +71,10 @@ do ;; --enable_qat=*) ENABLE_QAT=("${arg#*=}") - ARROW_ENABLE_CUSTOM_CODEC=("${arg#*=}") shift # Remove argument name from processing ;; --enable_iaa=*) ENABLE_IAA=("${arg#*=}") - ARROW_ENABLE_CUSTOM_CODEC=("${arg#*=}") shift # Remove argument name from processing ;; --enable_hbm=*) @@ -136,8 +133,8 @@ do BUILD_VELOX_BENCHMARKS=("${arg#*=}") shift # Remove argument name from processing ;; - --compile_arrow_java=*) - COMPILE_ARROW_JAVA=("${arg#*=}") + --build_arrow=*) + BUILD_ARROW=("${arg#*=}") shift # Remove argument name from processing ;; --num_threads=*) @@ -189,30 +186,45 @@ fi concat_velox_param +function build_arrow { + echo "Start to build Arrow" + export SUDO=sudo + cd $GLUTEN_DIR/dev + source build_arrow.sh + prepare_arrow_build + build_arrow_cpp + echo "Finished building arrow CPP" + build_arrow_java + echo "Finished building arrow Java" +} + function build_velox { echo "Start to build Velox" cd $GLUTEN_DIR/ep/build-velox/src - ./get_velox.sh --enable_hdfs=$ENABLE_HDFS --build_protobuf=$BUILD_PROTOBUF --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --enable_abfs=$ENABLE_ABFS $VELOX_PARAMETER + ./get_velox.sh --enable_hdfs=$ENABLE_HDFS --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --enable_abfs=$ENABLE_ABFS $VELOX_PARAMETER # When BUILD_TESTS is on for gluten cpp, we need turn on VELOX_BUILD_TEST_UTILS via build_test_utils. 
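`get_cxx_flags` above probes the host CPU (via `/proc/cpuinfo` or `sysctl`) and emits the matching `-mavx2 ...` or `-msse4.2 ...` flag set. Below is a rough C++ analogue of the x86 branch only, using GCC/Clang's `__builtin_cpu_supports`; it is a simplification that checks `avx2`/`sse4.2` directly and ignores the macOS and ARM paths handled by the script.

```cpp
#include <iostream>
#include <string>

// Rough C++ analogue of the x86 path in get_cxx_flags():
// probe CPU features at runtime and choose the matching flag set.
std::string pickCxxFlags() {
#if defined(__x86_64__) || defined(__i386__)
  __builtin_cpu_init();
  if (__builtin_cpu_supports("avx2")) {
    return "-mavx2 -mfma -mavx -mf16c -mlzcnt -std=c++17 -mbmi2";
  }
  if (__builtin_cpu_supports("sse4.2")) {
    return "-msse4.2 -std=c++17";
  }
#endif
  return "-std=c++17";  // fallback; the shell script reports an error instead
}

int main() {
  std::cout << pickCxxFlags() << "\n";
}
```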
./build_velox.sh --run_setup_script=$RUN_SETUP_SCRIPT --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --build_type=$BUILD_TYPE --enable_hdfs=$ENABLE_HDFS \ --enable_abfs=$ENABLE_ABFS --enable_ep_cache=$ENABLE_EP_CACHE --build_test_utils=$BUILD_TESTS --build_tests=$BUILD_VELOX_TESTS --build_benchmarks=$BUILD_VELOX_BENCHMARKS \ - --compile_arrow_java=$COMPILE_ARROW_JAVA --num_threads=$NUM_THREADS + --num_threads=$NUM_THREADS } -## compile gluten cpp function build_gluten_cpp { - echo "Start to Gluten CPP" + echo "Start to build Gluten CPP" cd $GLUTEN_DIR/cpp rm -rf build mkdir build cd build cmake -DBUILD_VELOX_BACKEND=ON -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DBUILD_TESTS=$BUILD_TESTS -DBUILD_EXAMPLES=$BUILD_EXAMPLES -DBUILD_BENCHMARKS=$BUILD_BENCHMARKS -DBUILD_JEMALLOC=$BUILD_JEMALLOC \ - -DENABLE_HBM=$ENABLE_HBM -DENABLE_QAT=$ENABLE_QAT -DENABLE_IAA=$ENABLE_IAA -DENABLE_GCS=$ENABLE_GCS -DENABLE_S3=$ENABLE_S3 -DENABLE_HDFS=$ENABLE_HDFS -DENABLE_ABFS=$ENABLE_ABFS .. + -DENABLE_HBM=$ENABLE_HBM -DENABLE_QAT=$ENABLE_QAT -DENABLE_IAA=$ENABLE_IAA -DBUILD_PROTOBUF=$BUILD_PROTOBUF -DENABLE_GCS=$ENABLE_GCS \ + -DENABLE_S3=$ENABLE_S3 -DENABLE_HDFS=$ENABLE_HDFS -DENABLE_ABFS=$ENABLE_ABFS .. make -j $NUM_THREADS } function build_velox_backend { + if [ $BUILD_ARROW == "ON" ]; then + build_arrow + fi build_velox build_gluten_cpp } diff --git a/dev/ci-velox-buildstatic.sh b/dev/ci-velox-buildstatic.sh index 0754408169cf9..74688ff307cb7 100755 --- a/dev/ci-velox-buildstatic.sh +++ b/dev/ci-velox-buildstatic.sh @@ -2,8 +2,8 @@ yum install sudo patch java-1.8.0-openjdk-devel -y cd $GITHUB_WORKSPACE/ep/build-velox/src ./get_velox.sh source /opt/rh/devtoolset-9/enable -source $GITHUB_WORKSPACE/dev/vcpkg/env.sh cd $GITHUB_WORKSPACE/ +source ./dev/vcpkg/env.sh sed -i '/^headers/d' ep/build-velox/build/velox_ep/CMakeLists.txt export NUM_THREADS=4 -./dev/builddeps-veloxbe.sh --build_tests=OFF --build_benchmarks=OFF --enable_s3=ON --enable_gcs=ON --enable_hdfs=ON --enable_abfs=ON +./dev/builddeps-veloxbe.sh --build_tests=OFF --build_benchmarks=OFF --enable_s3=ON --enable_gcs=OFF --enable_hdfs=ON --enable_abfs=ON diff --git a/dev/formatcppcode.sh b/dev/formatcppcode.sh index d391235dac9d0..4ce4155f78b63 100755 --- a/dev/formatcppcode.sh +++ b/dev/formatcppcode.sh @@ -1,3 +1,13 @@ cd `dirname $0` + +# Check if clang-format-15 is installed +if ! command -v clang-format-15 &> /dev/null +then + echo "clang-format-15 could not be found" + echo "Installing clang-format-15..." 
+ sudo apt update + sudo apt install clang-format-15 +fi + find ../cpp/core -regex '.*\.\(cc\|hpp\|cu\|c\|h\)' -exec clang-format-15 -style=file -i {} \; -find ../cpp/velox -regex '.*\.\(cc\|hpp\|cu\|c\|h\)' -exec clang-format-15 -style=file -i {} \; +find ../cpp/velox -regex '.*\.\(cc\|hpp\|cu\|c\|h\)' -exec clang-format-15 -style=file -i {} \; \ No newline at end of file diff --git a/dev/info.sh b/dev/info.sh index ed0e0a8dd3901..8a5fbd182f846 100644 --- a/dev/info.sh +++ b/dev/info.sh @@ -38,6 +38,7 @@ Commit: $(git rev-parse HEAD 2> /dev/null || echo "Not in a git repo.") CMake Version: $(cmake --version | grep -oE '[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+') System: $(print_info 'CMAKE_SYSTEM "') Arch: $(print_info 'CMAKE_SYSTEM_PROCESSOR') +CPU Name: $(lscpu | grep 'Model name') C++ Compiler: $(print_info 'CMAKE_CXX_COMPILER ==') C++ Compiler Version: $(print_info 'CMAKE_CXX_COMPILER_VERSION') C Compiler: $(print_info 'CMAKE_C_COMPILER ==') diff --git a/dev/package-vcpkg.sh b/dev/package-vcpkg.sh index 3f14f1c902d1d..4a248b556726b 100755 --- a/dev/package-vcpkg.sh +++ b/dev/package-vcpkg.sh @@ -6,9 +6,10 @@ CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) GLUTEN_DIR="$CURRENT_DIR/.." cd "$GLUTEN_DIR" -source ./dev/vcpkg/env.sh +source /opt/rh/devtoolset-9/enable +source /opt/gluten/dev/vcpkg/env.sh ./dev/buildbundle-veloxbe.sh --build_tests=ON --build_benchmarks=ON --enable_s3=ON --enable_hdfs=ON mvn clean package -Pbackends-velox -Pceleborn -Puniffle -Pspark-3.2 -DskipTests mvn clean package -Pbackends-velox -Pceleborn -Puniffle -Pspark-3.3 -DskipTests mvn clean package -Pbackends-velox -Pceleborn -Puniffle -Pspark-3.4 -DskipTests -mvn clean package -Pbackends-velox -Pceleborn -Puniffle -Pspark-3.5 -DskipTests \ No newline at end of file +mvn clean package -Pbackends-velox -Pceleborn -Puniffle -Pspark-3.5 -DskipTests diff --git a/dev/vcpkg/CONTRIBUTING.md b/dev/vcpkg/CONTRIBUTING.md index b725f0b50fc57..719bc91db066f 100644 --- a/dev/vcpkg/CONTRIBUTING.md +++ b/dev/vcpkg/CONTRIBUTING.md @@ -13,7 +13,7 @@ Please init vcpkg env first: Vcpkg already maintains a lot of libraries. You can find them by vcpkg cli. -(NOTE: Please always use cli beacause [packages on vcpkg.io](https://vcpkg.io/en/packages.html) is outdate). +(NOTE: Please always use cli because [packages on vcpkg.io](https://vcpkg.io/en/packages.html) is outdated). ``` $ ./.vcpkg/vcpkg search folly @@ -28,7 +28,7 @@ folly[zlib] Support zlib for compression folly[zstd] Support zstd for compression ``` -`[...]` means additional features. Then add depend into [vcpkg.json](./vcpkg.json). +`[...]` means additional features. Then add the dependency into [vcpkg.json](./vcpkg.json). ``` json { @@ -144,7 +144,7 @@ See [vcpkg.json reference](https://learn.microsoft.com/en-us/vcpkg/reference/vcp `portfile.cmake` is a cmake script describing how to build and install the package. A typical portfile has 3 stages: -**Download and perpare source**: +**Download and prepare source**: ``` cmake # Download from Github diff --git a/dev/vcpkg/Makefile b/dev/vcpkg/Makefile index e52393b040ad0..11cd02d8819be 100644 --- a/dev/vcpkg/Makefile +++ b/dev/vcpkg/Makefile @@ -1,4 +1,4 @@ -DOCKER_IMAGE=gluten-builder-vcpkg +DOCKER_IMAGE=apache/gluten:gluten-vcpkg-builder_2024_05_22 GLUTEN_REPO=$(shell realpath -L ../..)
CCACHE_DIR=$(HOME)/.ccache @@ -8,12 +8,14 @@ MAVEN_M2_DIR=$(HOME)/.m2 $(info $(GLUTEN_REPO)) .PHONY: docker-image build -build: docker-image | $(CCACHE_DIR) $(VCPKG_BINARY_CACHE_DIR) $(MAVEN_M2_DIR) +build: $(CCACHE_DIR) $(VCPKG_BINARY_CACHE_DIR) $(MAVEN_M2_DIR) docker run --rm -ti \ -v $(GLUTEN_REPO):$(GLUTEN_REPO) \ - -v $(VCPKG_BINARY_CACHE_DIR):/home/build/.cache/vcpkg \ - -v $(MAVEN_M2_DIR):/home/build/.m2 \ - -v $(CCACHE_DIR):/home/build/.ccache \ + -v $(VCPKG_BINARY_CACHE_DIR):/root/.cache/vcpkg \ + -v $(MAVEN_M2_DIR):/root/.m2 \ + -v $(CCACHE_DIR):/root/.ccache \ + -e http_proxy \ + -e https_proxy \ --workdir $(GLUTEN_REPO) \ -ti \ $(DOCKER_IMAGE) \ diff --git a/dev/vcpkg/ports/folly/boost-1.70.patch b/dev/vcpkg/ports/folly/boost-1.70.patch index 298516f826b01..da6b81788fc86 100644 --- a/dev/vcpkg/ports/folly/boost-1.70.patch +++ b/dev/vcpkg/ports/folly/boost-1.70.patch @@ -1,5 +1,5 @@ diff --git a/folly/portability/PThread.cpp b/folly/portability/PThread.cpp -index 2126ec0..ef82ade 100644 +index 777ac21ef..a4fc13a09 100644 --- a/folly/portability/PThread.cpp +++ b/folly/portability/PThread.cpp @@ -20,6 +20,8 @@ @@ -11,7 +11,7 @@ index 2126ec0..ef82ade 100644 #include -@@ -691,7 +693,7 @@ int pthread_setspecific(pthread_key_t key, const void* value) { +@@ -682,7 +684,7 @@ int pthread_setspecific(pthread_key_t key, const void* value) { boost::detail::thread::cleanup_caller_t(), boost::detail::thread::cleanup_func_t(), #else diff --git a/dev/vcpkg/ports/folly/disable-non-underscore-posix-names.patch b/dev/vcpkg/ports/folly/disable-non-underscore-posix-names.patch deleted file mode 100644 index c039947920b1b..0000000000000 --- a/dev/vcpkg/ports/folly/disable-non-underscore-posix-names.patch +++ /dev/null @@ -1,34 +0,0 @@ -diff --git a/folly/portability/Windows.h b/folly/portability/Windows.h -index 86fd0f9..ccad11e 100644 ---- a/folly/portability/Windows.h -+++ b/folly/portability/Windows.h -@@ -32,24 +32,17 @@ - // disabled to ensure all of the normal names get declared properly. 
- #include - --#ifndef __STDC__ --/* nolint */ --#define __STDC__ 1 - #pragma push_macro("_CRT_DECLARE_NONSTDC_NAMES") - #ifdef _CRT_DECLARE_NONSTDC_NAMES - #undef _CRT_DECLARE_NONSTDC_NAMES - #endif -+ -+#include - #pragma push_macro("_CRT_INTERNAL_NONSTDC_NAMES") --#undef _CRT_INTERNAL_NONSTDC_NAMES --#include // @manual nolint --#include // @manual nolint --#undef __STDC__ -+#define _CRT_INTERNAL_NONSTDC_NAMES 0 -+#include -+#include - #pragma pop_macro("_CRT_INTERNAL_NONSTDC_NAMES") --#pragma pop_macro("_CRT_DECLARE_NONSTDC_NAMES") --#else --#include // @manual nolint --#include // @manual nolint --#endif - - #if defined(min) || defined(max) - #error Windows.h needs to be included by this header, or else NOMINMAX needs \ diff --git a/dev/vcpkg/ports/folly/disable-uninitialized-resize-on-new-stl.patch b/dev/vcpkg/ports/folly/disable-uninitialized-resize-on-new-stl.patch new file mode 100644 index 0000000000000..6e7b6b5718230 --- /dev/null +++ b/dev/vcpkg/ports/folly/disable-uninitialized-resize-on-new-stl.patch @@ -0,0 +1,34 @@ +diff --git a/folly/memory/UninitializedMemoryHacks.h b/folly/memory/UninitializedMemoryHacks.h +index daf5eb735..1ac44d6b2 100644 +--- a/folly/memory/UninitializedMemoryHacks.h ++++ b/folly/memory/UninitializedMemoryHacks.h +@@ -101,6 +101,9 @@ template < + typename std::enable_if::value>::type> + inline void resizeWithoutInitialization( + std::basic_string& s, std::size_t n) { ++#if defined(_MSVC_STL_UPDATE) && _MSVC_STL_UPDATE >= 202206L ++ s.resize(n); ++#else + if (n <= s.size()) { + s.resize(n); + } else { +@@ -111,6 +114,7 @@ inline void resizeWithoutInitialization( + } + detail::unsafeStringSetLargerSize(s, n); + } ++#endif // defined(_MSVC_STL_UPDATE) && _MSVC_STL_UPDATE >= 202206L + } + + /** +@@ -278,8 +282,11 @@ struct MakeUnsafeStringSetLargerSize { + } // namespace folly + + #if defined(FOLLY_DECLARE_STRING_RESIZE_WITHOUT_INIT) ++#if defined(_MSVC_STL_UPDATE) && _MSVC_STL_UPDATE >= 202206L ++#else + FOLLY_DECLARE_STRING_RESIZE_WITHOUT_INIT(char) + FOLLY_DECLARE_STRING_RESIZE_WITHOUT_INIT(wchar_t) ++#endif // defined(_MSVC_STL_UPDATE) && _MSVC_STL_UPDATE >= 202206L + #endif + + namespace folly { diff --git a/dev/vcpkg/ports/folly/fix-deps.patch b/dev/vcpkg/ports/folly/fix-deps.patch index 179fa71be73eb..442ad80636865 100644 --- a/dev/vcpkg/ports/folly/fix-deps.patch +++ b/dev/vcpkg/ports/folly/fix-deps.patch @@ -1,12 +1,13 @@ diff --git a/CMake/folly-config.cmake.in b/CMake/folly-config.cmake.in -index 0b96f0a10..c90110287 100644 +index 0b96f0a10..1559f9f70 100644 --- a/CMake/folly-config.cmake.in +++ b/CMake/folly-config.cmake.in -@@ -29,10 +29,35 @@ endif() +@@ -29,10 +29,30 @@ endif() set(FOLLY_LIBRARIES Folly::folly) # Find folly's dependencies -find_dependency(fmt) ++find_dependency(fmt CONFIG) +find_dependency(double-conversion CONFIG) +find_dependency(glog CONFIG) +find_dependency(gflags CONFIG) @@ -27,12 +28,6 @@ index 0b96f0a10..c90110287 100644 +if (NOT @CMAKE_DISABLE_FIND_PACKAGE_LZ4@) + find_dependency(lz4 CONFIG) +endif() -+ -+if (@WITH_liburing@) -+ find_dependency(LibUring) -+endif() -+ -+find_dependency(fmt CONFIG) set(Boost_USE_STATIC_LIBS "@FOLLY_BOOST_LINK_STATIC@") -find_dependency(Boost 1.51.0 MODULE @@ -41,7 +36,7 @@ index 0b96f0a10..c90110287 100644 context filesystem diff --git a/CMake/folly-deps.cmake b/CMake/folly-deps.cmake -index 4b78e9f02..eb77e29c9 100644 +index 41a513a30..d75908f77 100644 --- a/CMake/folly-deps.cmake +++ b/CMake/folly-deps.cmake @@ -35,7 +35,7 @@ else() @@ -53,27 +48,17 @@ index 4b78e9f02..eb77e29c9 
100644 COMPONENTS context filesystem -@@ -45,37 +45,38 @@ find_package(Boost 1.51.0 MODULE +@@ -45,37 +45,37 @@ find_package(Boost 1.51.0 MODULE thread REQUIRED ) -+set (Boost_LIBRARIES Boost::context Boost::filesystem Boost::program_options Boost::regex Boost::system Boost::thread -+ ) ++set(Boost_LIBRARIES Boost::boost Boost::context Boost::filesystem Boost::program_options Boost::regex Boost::system Boost::thread) list(APPEND FOLLY_LINK_LIBRARIES ${Boost_LIBRARIES}) -list(APPEND FOLLY_INCLUDE_DIRECTORIES ${Boost_INCLUDE_DIRS}) - +- -find_package(DoubleConversion MODULE REQUIRED) -list(APPEND FOLLY_LINK_LIBRARIES ${DOUBLE_CONVERSION_LIBRARY}) -list(APPEND FOLLY_INCLUDE_DIRECTORIES ${DOUBLE_CONVERSION_INCLUDE_DIR}) -+find_package(double-conversion CONFIG REQUIRED) -+list(APPEND FOLLY_LINK_LIBRARIES double-conversion::double-conversion) - --find_package(Glog CONFIG REQUIRED) --set(FOLLY_HAVE_LIBGLOG ${GLOG_FOUND}) -+find_package(glog CONFIG REQUIRED) -+set(FOLLY_HAVE_LIBGLOG 1) - list(APPEND FOLLY_LINK_LIBRARIES glog::glog) --list(APPEND FOLLY_INCLUDE_DIRECTORIES ${GLOG_INCLUDE_DIR}) - -find_package(Gflags MODULE) -set(FOLLY_HAVE_LIBGFLAGS ${LIBGFLAGS_FOUND}) @@ -82,22 +67,21 @@ index 4b78e9f02..eb77e29c9 100644 - list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBGFLAGS_INCLUDE_DIR}) - set(FOLLY_LIBGFLAGS_LIBRARY ${LIBGFLAGS_LIBRARY}) - set(FOLLY_LIBGFLAGS_INCLUDE ${LIBGFLAGS_INCLUDE_DIR}) -+ -+find_package(gflags CONFIG) -+if(TARGET gflags::gflags) -+ set(FOLLY_HAVE_LIBGFLAGS 1) -+ list(APPEND FOLLY_LINK_LIBRARIES gflags::gflags) -+ set(FOLLY_LIBGFLAGS_LIBRARY gflags::gflags) - endif() +-endif() + +-find_package(Glog MODULE) +-set(FOLLY_HAVE_LIBGLOG ${GLOG_FOUND}) +-list(APPEND FOLLY_LINK_LIBRARIES ${GLOG_LIBRARY}) +-list(APPEND FOLLY_INCLUDE_DIRECTORIES ${GLOG_INCLUDE_DIR}) ++find_package(double-conversion CONFIG REQUIRED) ++list(APPEND FOLLY_LINK_LIBRARIES double-conversion::double-conversion) -find_package(LibEvent MODULE REQUIRED) -list(APPEND FOLLY_LINK_LIBRARIES ${LIBEVENT_LIB}) -list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBEVENT_INCLUDE_DIR}) -+find_package(Libevent CONFIG REQUIRED) -+list(APPEND FOLLY_LINK_LIBRARIES libevent::core libevent::extra) -+if(NOT WIN32) -+ list(APPEND FOLLY_LINK_LIBRARIES libevent::pthreads) -+endif() ++find_package(glog CONFIG REQUIRED) ++set(FOLLY_HAVE_LIBGLOG 1) ++list(APPEND FOLLY_LINK_LIBRARIES glog::glog) -find_package(ZLIB MODULE) -set(FOLLY_HAVE_LIBZ ${ZLIB_FOUND}) @@ -105,18 +89,31 @@ index 4b78e9f02..eb77e29c9 100644 - list(APPEND FOLLY_INCLUDE_DIRECTORIES ${ZLIB_INCLUDE_DIRS}) - list(APPEND FOLLY_LINK_LIBRARIES ${ZLIB_LIBRARIES}) - list(APPEND CMAKE_REQUIRED_LIBRARIES ${ZLIB_LIBRARIES}) ++find_package(gflags CONFIG) ++if(TARGET gflags::gflags) ++ set(FOLLY_HAVE_LIBGFLAGS 1) ++ list(APPEND FOLLY_LINK_LIBRARIES gflags::gflags) ++ set(FOLLY_LIBGFLAGS_LIBRARY gflags::gflags) ++endif() ++ ++find_package(Libevent CONFIG REQUIRED) ++list(APPEND FOLLY_LINK_LIBRARIES libevent::core libevent::extra) ++if(NOT WIN32) ++ list(APPEND FOLLY_LINK_LIBRARIES libevent::pthreads) ++endif() ++ +if (CMAKE_REQUIRE_FIND_PACKAGE_ZLIB) -+ find_package(ZLIB MODULE REQUIRED) -+ set(FOLLY_HAVE_LIBZ ${ZLIB_FOUND}) -+ if (ZLIB_FOUND) -+ list(APPEND FOLLY_INCLUDE_DIRECTORIES ${ZLIB_INCLUDE_DIRS}) -+ list(APPEND FOLLY_LINK_LIBRARIES ${ZLIB_LIBRARIES}) -+ list(APPEND CMAKE_REQUIRED_LIBRARIES ${ZLIB_LIBRARIES}) -+ endif() ++ find_package(ZLIB MODULE REQUIRED) ++ set(FOLLY_HAVE_LIBZ ${ZLIB_FOUND}) ++ if (ZLIB_FOUND) ++ list(APPEND FOLLY_INCLUDE_DIRECTORIES ${ZLIB_INCLUDE_DIRS}) ++ 
list(APPEND FOLLY_LINK_LIBRARIES ${ZLIB_LIBRARIES}) ++ list(APPEND CMAKE_REQUIRED_LIBRARIES ${ZLIB_LIBRARIES}) ++ endif() endif() find_package(OpenSSL 1.1.1 MODULE REQUIRED) -@@ -104,25 +105,30 @@ if (LIBLZMA_FOUND) +@@ -103,25 +103,30 @@ if (LIBLZMA_FOUND) list(APPEND FOLLY_LINK_LIBRARIES ${LIBLZMA_LIBRARIES}) endif() @@ -125,7 +122,7 @@ index 4b78e9f02..eb77e29c9 100644 -if (LZ4_FOUND) - list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LZ4_INCLUDE_DIR}) - list(APPEND FOLLY_LINK_LIBRARIES ${LZ4_LIBRARY}) -+if (NOT CMAKE_DISABLE_FIND_PACKAGE_LZ4) ++if(NOT CMAKE_DISABLE_FIND_PACKAGE_LZ4) + find_package(lz4 CONFIG) + if(TARGET lz4::lz4) + set(FOLLY_HAVE_LIBLZ4 1) @@ -138,15 +135,15 @@ index 4b78e9f02..eb77e29c9 100644 -if(ZSTD_FOUND) - list(APPEND FOLLY_INCLUDE_DIRECTORIES ${ZSTD_INCLUDE_DIR}) - list(APPEND FOLLY_LINK_LIBRARIES ${ZSTD_LIBRARY}) -+if (NOT CMAKE_DISABLE_FIND_PACKAGE_Zstd) -+ find_package(zstd CONFIG) -+ if(TARGET zstd::libzstd_shared ) -+ set(FOLLY_HAVE_LIBZSTD 1) -+ list(APPEND FOLLY_LINK_LIBRARIES zstd::libzstd_shared) -+ elseif( TARGET zstd::libzstd_static ) -+ set(FOLLY_HAVE_LIBZSTD 1) -+ list(APPEND FOLLY_LINK_LIBRARIES zstd::libzstd_static) -+ endif() ++if(NOT CMAKE_DISABLE_FIND_PACKAGE_Zstd) ++ find_package(zstd CONFIG) ++ if(TARGET zstd::libzstd_shared ) ++ set(FOLLY_HAVE_LIBZSTD 1) ++ list(APPEND FOLLY_LINK_LIBRARIES zstd::libzstd_shared) ++ elseif( TARGET zstd::libzstd_static ) ++ set(FOLLY_HAVE_LIBZSTD 1) ++ list(APPEND FOLLY_LINK_LIBRARIES zstd::libzstd_static) ++ endif() endif() -find_package(Snappy MODULE) @@ -154,49 +151,62 @@ index 4b78e9f02..eb77e29c9 100644 -if (SNAPPY_FOUND) - list(APPEND FOLLY_INCLUDE_DIRECTORIES ${SNAPPY_INCLUDE_DIR}) - list(APPEND FOLLY_LINK_LIBRARIES ${SNAPPY_LIBRARY}) -+if (NOT CMAKE_DISABLE_FIND_PACKAGE_Snappy) -+ find_package(Snappy CONFIG) -+ if(TARGET Snappy::snappy) -+ list(APPEND FOLLY_LINK_LIBRARIES Snappy::snappy) -+ endif() ++if(NOT CMAKE_DISABLE_FIND_PACKAGE_Snappy) ++ find_package(Snappy CONFIG) ++ if(TARGET Snappy::snappy) ++ list(APPEND FOLLY_LINK_LIBRARIES Snappy::snappy) ++ endif() endif() find_package(LibDwarf) -@@ -137,13 +143,18 @@ find_package(LibAIO) - list(APPEND FOLLY_LINK_LIBRARIES ${LIBAIO_LIBRARIES}) - list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBAIO_INCLUDE_DIRS}) +@@ -132,17 +137,24 @@ find_package(Libiberty) + list(APPEND FOLLY_LINK_LIBRARIES ${LIBIBERTY_LIBRARIES}) + list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBIBERTY_INCLUDE_DIRS}) + +-find_package(LibAIO) +-list(APPEND FOLLY_LINK_LIBRARIES ${LIBAIO_LIBRARIES}) +-list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBAIO_INCLUDE_DIRS}) ++if(WITH_libaio) ++ find_package(LibAIO) ++ list(APPEND FOLLY_LINK_LIBRARIES ${LIBAIO_LIBRARIES}) ++ list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBAIO_INCLUDE_DIRS}) ++endif() +-find_package(LibUring) +-list(APPEND FOLLY_LINK_LIBRARIES ${LIBURING_LIBRARIES}) +-list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBURING_INCLUDE_DIRS}) +if(WITH_liburing) - find_package(LibUring) - list(APPEND FOLLY_LINK_LIBRARIES ${LIBURING_LIBRARIES}) - list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBURING_INCLUDE_DIRS}) ++ find_package(LibUring) ++ list(APPEND FOLLY_LINK_LIBRARIES ${LIBURING_LIBRARIES}) ++ list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBURING_INCLUDE_DIRS}) +endif() -find_package(Libsodium) -list(APPEND FOLLY_LINK_LIBRARIES ${LIBSODIUM_LIBRARIES}) -list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBSODIUM_INCLUDE_DIRS}) -+if (NOT CMAKE_DISABLE_FIND_PACKAGE_unofficial-sodium) -+ find_package(unofficial-sodium CONFIG) -+ if(TARGET unofficial-sodium::sodium) -+ list(APPEND 
FOLLY_LINK_LIBRARIES unofficial-sodium::sodium) -+ endif() ++if(NOT CMAKE_DISABLE_FIND_PACKAGE_unofficial-sodium) ++ find_package(unofficial-sodium CONFIG) ++ if(TARGET unofficial-sodium::sodium) ++ list(APPEND FOLLY_LINK_LIBRARIES unofficial-sodium::sodium) ++ endif() +endif() list(APPEND FOLLY_LINK_LIBRARIES ${CMAKE_DL_LIBS}) list(APPEND CMAKE_REQUIRED_LIBRARIES ${CMAKE_DL_LIBS}) -@@ -154,9 +165,9 @@ if (PYTHON_EXTENSIONS) +@@ -153,10 +165,10 @@ if (PYTHON_EXTENSIONS) endif () find_package(LibUnwind) -list(APPEND FOLLY_LINK_LIBRARIES ${LIBUNWIND_LIBRARIES}) -list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBUNWIND_INCLUDE_DIRS}) if (LIBUNWIND_FOUND) + set(FOLLY_HAVE_LIBUNWIND ON) + list(APPEND FOLLY_LINK_LIBRARIES ${LIBUNWIND_LIBRARIES}) + list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBUNWIND_INCLUDE_DIRS}) - set(FOLLY_HAVE_LIBUNWIND ON) endif() if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") -@@ -299,11 +310,7 @@ endif() + list(APPEND FOLLY_LINK_LIBRARIES "execinfo") +@@ -298,11 +310,7 @@ endif() add_library(folly_deps INTERFACE) diff --git a/dev/vcpkg/ports/folly/fix-unistd-include.patch b/dev/vcpkg/ports/folly/fix-unistd-include.patch new file mode 100644 index 0000000000000..9e3fe5d7a7208 --- /dev/null +++ b/dev/vcpkg/ports/folly/fix-unistd-include.patch @@ -0,0 +1,13 @@ +diff --git a/folly/experimental/symbolizer/Elf.cpp b/folly/experimental/symbolizer/Elf.cpp +index 6a7f74f..6546fe0 100644 +--- a/folly/experimental/symbolizer/Elf.cpp ++++ b/folly/experimental/symbolizer/Elf.cpp +@@ -49,6 +49,8 @@ + #define FOLLY_ELF_NATIVE_CLASS __WORDSIZE + #endif // __ELF_NATIVE_CLASS + ++#include ++ + namespace folly { + namespace symbolizer { + diff --git a/dev/vcpkg/ports/folly/portfile.cmake b/dev/vcpkg/ports/folly/portfile.cmake index 12d503331821e..40a0231ea31f6 100644 --- a/dev/vcpkg/ports/folly/portfile.cmake +++ b/dev/vcpkg/ports/folly/portfile.cmake @@ -8,15 +8,15 @@ vcpkg_add_to_path("${PYTHON3_DIR}") vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO facebook/folly - REF "v${VERSION}" #v2023.12.04.00 - SHA512 5fe3d99dbd4f10698cc3f5076f80f627932c5f527210fac5610f60e373b90ce0ce44655e4be8467dbafcd007d6b82ec05f4b4014ff4697c5e144d6a2105a3c92 + REF "v${VERSION}" + SHA512 6f4fa9b30ff0aba4c1c89619ada2cafd8943ba9a06b6ac4086fba212f26f0df606c8735485110eec6977900d45fc33ddcfdb8095dc6728b8c12b5cac65672e12 HEAD_REF main PATCHES - reorder-glog-gflags.patch - disable-non-underscore-posix-names.patch boost-1.70.patch fix-windows-minmax.patch fix-deps.patch + disable-uninitialized-resize-on-new-stl.patch + fix-unistd-include.patch ) file(REMOVE "${SOURCE_PATH}/CMake/FindFmt.cmake") @@ -29,6 +29,7 @@ file(REMOVE "${SOURCE_PATH}/build/fbcode_builder/CMake/FindGMock.cmake") file(REMOVE "${SOURCE_PATH}/build/fbcode_builder/CMake/FindGflags.cmake") file(REMOVE "${SOURCE_PATH}/build/fbcode_builder/CMake/FindGlog.cmake") file(REMOVE "${SOURCE_PATH}/build/fbcode_builder/CMake/FindLibEvent.cmake") +file(REMOVE "${SOURCE_PATH}/build/fbcode_builder/CMake/FindLibUnwind.cmake") file(REMOVE "${SOURCE_PATH}/build/fbcode_builder/CMake/FindSodium.cmake") file(REMOVE "${SOURCE_PATH}/build/fbcode_builder/CMake/FindZstd.cmake") @@ -42,6 +43,7 @@ vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS FEATURES "zlib" CMAKE_REQUIRE_FIND_PACKAGE_ZLIB "liburing" WITH_liburing + "libaio" WITH_libaio INVERTED_FEATURES "bzip2" CMAKE_DISABLE_FIND_PACKAGE_BZip2 "lzma" CMAKE_DISABLE_FIND_PACKAGE_LibLZMA @@ -54,19 +56,19 @@ vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS vcpkg_cmake_configure( SOURCE_PATH "${SOURCE_PATH}" OPTIONS + # Exclude 
exception tracer, which is necessary to statically link libstdc++. + # https://github.com/facebook/folly/issues/1623 + -DFOLLY_NO_EXCEPTION_TRACER=ON -DMSVC_USE_STATIC_RUNTIME=${MSVC_USE_STATIC_RUNTIME} - # -DCMAKE_DISABLE_FIND_PACKAGE_LibDwarf=ON + -DCMAKE_DISABLE_FIND_PACKAGE_LibDwarf=OFF -DCMAKE_DISABLE_FIND_PACKAGE_Libiberty=ON -DCMAKE_DISABLE_FIND_PACKAGE_LibAIO=ON -DLIBAIO_FOUND=OFF + # Required by Velox. -DFOLLY_HAVE_INT128_T=ON -DCMAKE_INSTALL_DIR=share/folly - # Fix folly static link libstdc++ - # See: https://github.com/facebook/folly/blob/b88123c2abf4b3244ed285e6db0d4bea2d24f95f/CMakeLists.txt#L192 - -DFOLLY_NO_EXCEPTION_TRACER=ON ${FEATURE_OPTIONS} MAYBE_UNUSED_VARIABLES - LIBAIO_FOUND MSVC_USE_STATIC_RUNTIME ) diff --git a/dev/vcpkg/ports/folly/reorder-glog-gflags.patch b/dev/vcpkg/ports/folly/reorder-glog-gflags.patch deleted file mode 100644 index 911481d8854c6..0000000000000 --- a/dev/vcpkg/ports/folly/reorder-glog-gflags.patch +++ /dev/null @@ -1,28 +0,0 @@ -diff --git a/CMake/folly-deps.cmake b/CMake/folly-deps.cmake -index 92b8ce7..04589ed 100644 ---- a/CMake/folly-deps.cmake -+++ b/CMake/folly-deps.cmake -@@ -52,6 +52,11 @@ find_package(DoubleConversion MODULE REQUIRED) - list(APPEND FOLLY_LINK_LIBRARIES ${DOUBLE_CONVERSION_LIBRARY}) - list(APPEND FOLLY_INCLUDE_DIRECTORIES ${DOUBLE_CONVERSION_INCLUDE_DIR}) - -+find_package(Glog CONFIG REQUIRED) -+set(FOLLY_HAVE_LIBGLOG ${GLOG_FOUND}) -+list(APPEND FOLLY_LINK_LIBRARIES glog::glog) -+list(APPEND FOLLY_INCLUDE_DIRECTORIES ${GLOG_INCLUDE_DIR}) -+ - find_package(Gflags MODULE) - set(FOLLY_HAVE_LIBGFLAGS ${LIBGFLAGS_FOUND}) - if(LIBGFLAGS_FOUND) -@@ -61,11 +66,6 @@ if(LIBGFLAGS_FOUND) - set(FOLLY_LIBGFLAGS_INCLUDE ${LIBGFLAGS_INCLUDE_DIR}) - endif() - --find_package(Glog MODULE) --set(FOLLY_HAVE_LIBGLOG ${GLOG_FOUND}) --list(APPEND FOLLY_LINK_LIBRARIES ${GLOG_LIBRARY}) --list(APPEND FOLLY_INCLUDE_DIRECTORIES ${GLOG_INCLUDE_DIR}) -- - find_package(LibEvent MODULE REQUIRED) - list(APPEND FOLLY_LINK_LIBRARIES ${LIBEVENT_LIB}) - list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBEVENT_INCLUDE_DIR}) diff --git a/dev/vcpkg/ports/folly/vcpkg-cmake-wrapper.cmake b/dev/vcpkg/ports/folly/vcpkg-cmake-wrapper.cmake index f5997e8364683..cce45a2e43e66 100644 --- a/dev/vcpkg/ports/folly/vcpkg-cmake-wrapper.cmake +++ b/dev/vcpkg/ports/folly/vcpkg-cmake-wrapper.cmake @@ -1,6 +1,6 @@ -set(FOLLY_PREV_MODULE_PATH ${CMAKE_MODULE_PATH}) -list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}) - -_find_package(${ARGS}) - +set(FOLLY_PREV_MODULE_PATH ${CMAKE_MODULE_PATH}) +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}) + +_find_package(${ARGS}) + set(CMAKE_MODULE_PATH ${FOLLY_PREV_MODULE_PATH}) \ No newline at end of file diff --git a/dev/vcpkg/ports/folly/vcpkg.json b/dev/vcpkg/ports/folly/vcpkg.json index cbb1006baf579..b908f40dd9eac 100644 --- a/dev/vcpkg/ports/folly/vcpkg.json +++ b/dev/vcpkg/ports/folly/vcpkg.json @@ -1,7 +1,6 @@ { "name": "folly", - "version-string": "2023.12.04.00", - "port-version": 4, + "version-string": "2024.04.01.00", "description": "An open-source C++ library developed and used at Facebook. 
The library is UNSTABLE on Windows", "homepage": "https://github.com/facebook/folly", "license": "Apache-2.0", @@ -19,14 +18,13 @@ "boost-smart-ptr", "boost-system", "boost-thread", + "boost-variant", "double-conversion", "fmt", "gflags", "glog", "libevent", "openssl", - "libdwarf", - "libelf", { "name": "vcpkg-cmake", "host": true @@ -46,6 +44,10 @@ "bzip2" ] }, + "libaio": { + "description": "Support compile with libaio", + "supports": "linux & x64" + }, "libsodium": { "description": "Support libsodium for cryto", "dependencies": [ diff --git a/dev/vcpkg/ports/jemalloc/fix-configure-ac.patch b/dev/vcpkg/ports/jemalloc/fix-configure-ac.patch new file mode 100644 index 0000000000000..7799dfb9e80e3 --- /dev/null +++ b/dev/vcpkg/ports/jemalloc/fix-configure-ac.patch @@ -0,0 +1,13 @@ +diff --git a/configure.ac b/configure.ac +index f6d25f334..3115504e2 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -1592,7 +1592,7 @@ fi + [enable_uaf_detection="0"] + ) + if test "x$enable_uaf_detection" = "x1" ; then +- AC_DEFINE([JEMALLOC_UAF_DETECTION], [ ]) ++ AC_DEFINE([JEMALLOC_UAF_DETECTION], [ ], ["enable UAF"]) + fi + AC_SUBST([enable_uaf_detection]) + diff --git a/dev/vcpkg/ports/jemalloc/portfile.cmake b/dev/vcpkg/ports/jemalloc/portfile.cmake new file mode 100644 index 0000000000000..6cac12ca3b7c0 --- /dev/null +++ b/dev/vcpkg/ports/jemalloc/portfile.cmake @@ -0,0 +1,79 @@ +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO jemalloc/jemalloc + REF 54eaed1d8b56b1aa528be3bdd1877e59c56fa90c + SHA512 527bfbf5db9a5c2b7b04df4785b6ae9d445cff8cb17298bf3e550c88890d2bd7953642d8efaa417580610508279b527d3a3b9e227d17394fd2013c88cb7ae75a + HEAD_REF master + PATCHES + fix-configure-ac.patch + preprocessor.patch +) +if(VCPKG_TARGET_IS_WINDOWS) + set(opts "ac_cv_search_log=none required" + "--without-private-namespace" + "--with-jemalloc-prefix=je_gluten_" + "--with-private-namespace=je_gluten_private_" + "--without-export" + "--disable-shared" + "--disable-cxx" + "--disable-libdl" + # For fixing an issue when loading native lib: cannot allocate memory in static TLS block. + "--disable-initial-exec-tls" + "CFLAGS=-fPIC" + "CXXFLAGS=-fPIC") +else() + set(opts + "--with-jemalloc-prefix=je_gluten_" + "--with-private-namespace=je_gluten_private_" + "--without-export" + "--disable-shared" + "--disable-cxx" + "--disable-libdl" + # For fixing an issue when loading native lib: cannot allocate memory in static TLS block. 
+ "--disable-initial-exec-tls" + "CFLAGS=-fPIC" + "CXXFLAGS=-fPIC") +endif() + +vcpkg_configure_make( + SOURCE_PATH "${SOURCE_PATH}" + AUTOCONFIG + NO_WRAPPERS + OPTIONS ${opts} +) + +vcpkg_install_make() + +if(VCPKG_TARGET_IS_WINDOWS) + file(COPY "${SOURCE_PATH}/include/msvc_compat/strings.h" DESTINATION "${CURRENT_PACKAGES_DIR}/include/jemalloc/msvc_compat") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/include/jemalloc/jemalloc.h" "" "\"msvc_compat/strings.h\"") + if(VCPKG_LIBRARY_LINKAGE STREQUAL "dynamic") + file(COPY "${CURRENT_BUILDTREES_DIR}/${TARGET_TRIPLET}-rel/lib/jemalloc.lib" DESTINATION "${CURRENT_PACKAGES_DIR}/lib") + file(MAKE_DIRECTORY "${CURRENT_PACKAGES_DIR}/bin") + file(RENAME "${CURRENT_PACKAGES_DIR}/lib/jemalloc.dll" "${CURRENT_PACKAGES_DIR}/bin/jemalloc.dll") + endif() + if(NOT VCPKG_BUILD_TYPE) + if(VCPKG_LIBRARY_LINKAGE STREQUAL "dynamic") + file(COPY "${CURRENT_BUILDTREES_DIR}/${TARGET_TRIPLET}-dbg/lib/jemalloc.lib" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/lib") + file(MAKE_DIRECTORY "${CURRENT_PACKAGES_DIR}/debug/bin") + file(RENAME "${CURRENT_PACKAGES_DIR}/debug/lib/jemalloc.dll" "${CURRENT_PACKAGES_DIR}/debug/bin/jemalloc.dll") + endif() + endif() + if(VCPKG_LIBRARY_LINKAGE STREQUAL "static") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/jemalloc.pc" "install_suffix=" "install_suffix=_s") + if(NOT VCPKG_BUILD_TYPE) + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig/jemalloc.pc" "install_suffix=" "install_suffix=_s") + endif() + endif() +endif() + +vcpkg_fixup_pkgconfig() + +vcpkg_copy_pdbs() + +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/share") +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/tools") + +# Handle copyright +file(INSTALL "${SOURCE_PATH}/COPYING" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) diff --git a/dev/vcpkg/ports/jemalloc/preprocessor.patch b/dev/vcpkg/ports/jemalloc/preprocessor.patch new file mode 100644 index 0000000000000..6e6e2d1403fb2 --- /dev/null +++ b/dev/vcpkg/ports/jemalloc/preprocessor.patch @@ -0,0 +1,12 @@ +diff --git a/configure.ac b/configure.ac +index 3115504e2..ffb504b08 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -749,6 +749,7 @@ case "${host}" in + so="dll" + if test "x$je_cv_msvc" = "xyes" ; then + importlib="lib" ++ JE_APPEND_VS(CPPFLAGS, -DJEMALLOC_NO_PRIVATE_NAMESPACE) + DSO_LDFLAGS="-LD" + EXTRA_LDFLAGS="-link -DEBUG" + CTARGET='-Fo$@' diff --git a/dev/vcpkg/ports/jemalloc/vcpkg.json b/dev/vcpkg/ports/jemalloc/vcpkg.json new file mode 100644 index 0000000000000..007e05b931c98 --- /dev/null +++ b/dev/vcpkg/ports/jemalloc/vcpkg.json @@ -0,0 +1,8 @@ +{ + "name": "jemalloc", + "version": "5.3.0", + "port-version": 1, + "description": "jemalloc is a general purpose malloc(3) implementation that emphasizes fragmentation avoidance and scalable concurrency support", + "homepage": "https://jemalloc.net/", + "license": "BSD-2-Clause" +} diff --git a/dev/vcpkg/ports/simdjson/portfile.cmake b/dev/vcpkg/ports/simdjson/portfile.cmake new file mode 100644 index 0000000000000..ecc2321c13e17 --- /dev/null +++ b/dev/vcpkg/ports/simdjson/portfile.cmake @@ -0,0 +1,44 @@ +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO simdjson/simdjson + REF "v${VERSION}" + HEAD_REF master + SHA512 893ce0cb380b1418438f5910262325031f8071c4577589a491713f91c980964b4105c1e1aa7f2b9373deed40ecf6b48fe48a1aa243151e424f138f5418e4821c +) + +vcpkg_check_features( + OUT_FEATURE_OPTIONS FEATURE_OPTIONS + FEATURES + 
exceptions SIMDJSON_EXCEPTIONS + threads SIMDJSON_ENABLE_THREADS + INVERTED_FEATURES + deprecated SIMDJSON_DISABLE_DEPRECATED_API + utf8-validation SIMDJSON_SKIPUTF8VALIDATION +) + +string(COMPARE EQUAL "${VCPKG_LIBRARY_LINKAGE}" "static" SIMDJSON_BUILD_STATIC) + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + OPTIONS + -DSIMDJSON_JUST_LIBRARY=ON + -DSIMDJSON_SANITIZE_UNDEFINED=OFF + -DSIMDJSON_SANITIZE=OFF + -DSIMDJSON_SANITIZE_THREADS=OFF + -DSIMDJSON_BUILD_STATIC=${SIMDJSON_BUILD_STATIC} + -DSIMDJSON_DEVELOPMENT_CHECKS=OFF + -DSIMDJSON_VERBOSE_LOGGING=OFF + ${FEATURE_OPTIONS} +) + +vcpkg_cmake_install() + +vcpkg_copy_pdbs() + +vcpkg_cmake_config_fixup(CONFIG_PATH "lib/cmake/${PORT}") + +vcpkg_fixup_pkgconfig() + +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include" "${CURRENT_PACKAGES_DIR}/debug/share") + +vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/LICENSE") diff --git a/dev/vcpkg/ports/simdjson/vcpkg.json b/dev/vcpkg/ports/simdjson/vcpkg.json new file mode 100644 index 0000000000000..6e46382e42f8f --- /dev/null +++ b/dev/vcpkg/ports/simdjson/vcpkg.json @@ -0,0 +1,37 @@ +{ + "name": "simdjson", + "version": "3.9.3", + "description": "An extremely fast JSON library that can parse gigabytes of JSON per second", + "homepage": "https://simdjson.org/", + "license": "Apache-2.0", + "dependencies": [ + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + "host": true + } + ], + "default-features": [ + "deprecated", + "exceptions", + "threads", + "utf8-validation" + ], + "features": { + "deprecated": { + "description": "Enable deprecated APIs" + }, + "exceptions": { + "description": "Enable exception-throwing interface" + }, + "threads": { + "description": "Link with thread support" + }, + "utf8-validation": { + "description": "Enable UTF-8 validation" + } + } +} diff --git a/dev/vcpkg/vcpkg.json b/dev/vcpkg/vcpkg.json index 4593c86e7d51e..b7dcb059c0822 100644 --- a/dev/vcpkg/vcpkg.json +++ b/dev/vcpkg/vcpkg.json @@ -117,7 +117,6 @@ }, "overrides": [ { "name": "fmt", "version": "10.1.1" }, - { "name": "xsimd", "version": "10.0.0" }, - { "name": "simdjson", "version": "3.2.0" } + { "name": "xsimd", "version": "10.0.0" } ] } diff --git a/docs/Configuration.md b/docs/Configuration.md index a148ec3aa4cbc..2c2bd4de11f24 100644 --- a/docs/Configuration.md +++ b/docs/Configuration.md @@ -22,9 +22,9 @@ You can add these configurations into spark-defaults.conf to enable or disable t | spark.plugins | To load Gluten's components by Spark's plug-in loader | org.apache.gluten.GlutenPlugin | | spark.shuffle.manager | To turn on Gluten Columnar Shuffle Plugin | org.apache.spark.shuffle.sort.ColumnarShuffleManager | | spark.gluten.enabled | Enable Gluten, default is true. Just an experimental property. Recommend to enable/disable Gluten through the setting for `spark.plugins`. | true | +| spark.gluten.memory.isolation | (Experimental) Enable isolated memory mode. If true, Gluten controls the maximum off-heap memory can be used by each task to X, X = executor memory / max task slots. It's recommended to set true if Gluten serves concurrent queries within a single session, since not all memory Gluten allocated is guaranteed to be spillable. In the case, the feature should be enabled to avoid OOM. Note when true, setting spark.memory.storageFraction to a lower value is suggested since storage memory is considered non-usable by Gluten. 
| false | | spark.gluten.ras.enabled | Experimental: Enables RAS (relation algebra selector) during physical planning to generate more efficient query plan. Note, this feature is still in development and may not bring performance profits. | false | | spark.gluten.sql.columnar.maxBatchSize | Number of rows to be processed in each batch. Default value is 4096. | 4096 | -| spark.gluten.memory.isolation | (Experimental) Enable isolated memory mode. If true, Gluten controls the maximum off-heap memory can be used by each task to X, X = executor memory / max task slots. It's recommended to set true if Gluten serves concurrent queries within a single session, since not all memory Gluten allocated is guaranteed to be spillable. In the case, the feature should be enabled to avoid OOM. Note when true, setting spark.memory.storageFraction to a lower value is suggested since storage memory is considered non-usable by Gluten. | false | | spark.gluten.sql.columnar.scanOnly | When enabled, this config will overwrite all other operators' enabling, and only Scan and Filter pushdown will be offloaded to native. | false | | spark.gluten.sql.columnar.batchscan | Enable or Disable Columnar BatchScan, default is true | true | | spark.gluten.sql.columnar.hashagg | Enable or Disable Columnar Hash Aggregate, default is true | true | @@ -42,7 +42,7 @@ You can add these configurations into spark-defaults.conf to enable or disable t | spark.gluten.sql.columnar.tableCache | Enable or Disable Columnar Table Cache, default is false | true | | spark.gluten.sql.columnar.broadcastExchange | Enable or Disable Columnar Broadcast Exchange, default is true | true | | spark.gluten.sql.columnar.broadcastJoin | Enable or Disable Columnar BroadcastHashJoin, default is true | true | -| spark.gluten.sql.columnar.shuffle.sort.threshold | The threshold to determine whether to use sort-based columnar shuffle. Sort-based shuffle will be used if the number of partitions is greater than this threshold. | 100000 | +| spark.gluten.sql.columnar.shuffle.sort.threshold | The threshold to determine whether to use sort-based columnar shuffle. Sort-based shuffle will be used if the number of partitions is greater than this threshold. | 100000 | | spark.gluten.sql.columnar.shuffle.codec | Set up the codec to be used for Columnar Shuffle. If this configuration is not set, will check the value of spark.io.compression.codec. By default, Gluten use software compression. Valid options for software compression are lz4, zstd. Valid options for QAT and IAA is gzip. | lz4 | | spark.gluten.sql.columnar.shuffle.codecBackend | Enable using hardware accelerators for shuffle de/compression. Valid options are QAT and IAA. | | | spark.gluten.sql.columnar.shuffle.compressionMode | Setting different compression mode in shuffle, Valid options are buffer and rowvector, buffer option compress each buffer of RowVector individually into one pre-allocated large buffer, rowvector option first copies each buffer of RowVector to a large buffer and then compress the entire buffer in one go. | buffer | @@ -51,26 +51,25 @@ You can add these configurations into spark-defaults.conf to enable or disable t | spark.gluten.sql.columnar.shuffle.merge.threshold | Set the threshold control the minimum merged size. When a partition buffer is full, and the number of rows is below (`threshold * spark.gluten.sql.columnar.maxBatchSize`), it will be saved for merging. 
| 0.25 | | spark.gluten.sql.columnar.numaBinding | Set up NUMABinding, default is false | true | | spark.gluten.sql.columnar.coreRange | Set up the core range for NUMABinding, only works when numaBinding set to true.
The setting is based on the number of cores in your system. Use 72 cores as an example. | 0-17,36-53 |18-35,54-71 | -| spark.gluten.sql.native.bloomFilter | Enable or Disable native runtime bloom filter. | true | -| spark.gluten.sql.native.arrow.reader.enabled | Enable or Disable native arrow read CSV file format | false | | spark.gluten.sql.columnar.wholeStage.fallback.threshold | Configure the threshold for whether whole stage will fall back in AQE supported case by counting the number of ColumnarToRow & vanilla leaf node | \>= 1 | | spark.gluten.sql.columnar.query.fallback.threshold | Configure the threshold for whether query will fall back by counting the number of ColumnarToRow & vanilla leaf node | \>= 1 | | spark.gluten.sql.columnar.fallback.ignoreRowToColumnar | When true, the fallback policy ignores the RowToColumnar when counting fallback number. | true | | spark.gluten.sql.columnar.fallback.preferColumnar | When true, the fallback policy prefers to use Gluten plan rather than vanilla Spark plan if the both of them contains ColumnarToRow and the vanilla Spark plan ColumnarToRow number is not smaller than Gluten plan. | true | -| spark.gluten.sql.columnar.maxBatchSize | Set the number of rows for the output batch. | 4096 | +| spark.gluten.sql.columnar.force.hashagg | Force to use hash agg to replace sort agg. | true | +| spark.gluten.sql.columnar.vanillaReaders | Enable vanilla spark's vectorized reader. Please note it may bring perf. overhead due to extra data transition. We recommend to disable it if most queries can be fully offloaded to gluten. | false | +| spark.gluten.sql.native.bloomFilter | Enable or Disable native runtime bloom filter. | true | +| spark.gluten.sql.native.arrow.reader.enabled | Enable or Disable native arrow read CSV file format | false | | spark.gluten.shuffleWriter.bufferSize | Set the number of buffer rows for the shuffle writer | value of spark.gluten.sql.columnar.maxBatchSize | | spark.gluten.loadLibFromJar | Controls whether to load dynamic link library from a packed jar for gluten/cpp. Not applicable to static build and clickhouse backend. | false | | spark.gluten.loadLibOS | When `spark.gluten.loadLibFromJar` is true. Manually specify the system os to load library, e.g., CentOS | | | spark.gluten.loadLibOSVersion | Manually specify the system os version to load library, e.g., if `spark.gluten.loadLibOS` is CentOS, this config can be 7 | | -| spark.gluten.sql.columnar.force.hashagg | Force to use hash agg to replace sort agg. | true | -| spark.gluten.sql.columnar.vanillaReaders | Enable vanilla spark's vectorized reader. Please note it may bring perf. overhead due to extra data transition. We recommend to disable it if most queries can be fully offloaded to gluten. | false | | spark.gluten.expression.blacklist | A black list of expression to skip transform, multiple values separated by commas. | | | spark.gluten.sql.columnar.fallback.expressions.threshold | Fall back filter/project if the height of expression tree reaches this threshold, considering Spark codegen can bring better performance for such case. | 50 | | spark.gluten.sql.cartesianProductTransformerEnabled | Config to enable CartesianProductExecTransformer. | true | - | spark.gluten.sql.broadcastNestedLoopJoinTransformerEnabled | Config to enable BroadcastNestedLoopJoinExecTransformer. | true | - | spark.gluten.sql.cacheWholeStageTransformerContext | When true, `WholeStageTransformer` will cache the `WholeStageTransformerContext` when executing. 
It is used to get substrait plan node and native plan string. | false | - | spark.gluten.sql.injectNativePlanStringToExplain | When true, Gluten will inject native plan tree to explain string inside `WholeStageTransformerContext`. | false | - | spark.gluten.sql.fallbackRegexpExpressions | When true, Gluten will fall back all regexp expressions to avoid any incompatibility risk. | false | +| spark.gluten.sql.broadcastNestedLoopJoinTransformerEnabled | Config to enable BroadcastNestedLoopJoinExecTransformer. | true | +| spark.gluten.sql.cacheWholeStageTransformerContext | When true, `WholeStageTransformer` will cache the `WholeStageTransformerContext` when executing. It is used to get substrait plan node and native plan string. | false | +| spark.gluten.sql.injectNativePlanStringToExplain | When true, Gluten will inject native plan tree to explain string inside `WholeStageTransformerContext`. | false | +| spark.gluten.sql.fallbackRegexpExpressions | When true, Gluten will fall back all regexp expressions to avoid any incompatibility risk. | false | ## Velox Parameters @@ -81,41 +80,31 @@ The following configurations are related to Velox settings. | spark.gluten.sql.columnar.backend.velox.bloomFilter.expectedNumItems | The default number of expected items for the velox bloomfilter. | 1000000L | | spark.gluten.sql.columnar.backend.velox.bloomFilter.numBits | The default number of bits to use for the velox bloom filter. | 8388608L | | spark.gluten.sql.columnar.backend.velox.bloomFilter.maxNumBits | The max number of bits to use for the velox bloom filter. | 4194304L | - | spark.gluten.sql.columnar.backend.velox.fileHandleCacheEnabled | Disables caching if false. File handle cache should be disabled if files are mutable, i.e. file content may change while file path stays the same. | | - | spark.gluten.sql.columnar.backend.velox.directorySizeGuess | Set the directory size guess for velox file scan. | | - | spark.gluten.sql.columnar.backend.velox.filePreloadThreshold | Set the file preload threshold for velox file scan. | | - | spark.gluten.sql.columnar.backend.velox.prefetchRowGroups | Set the prefetch row groups for velox file scan. | | - | spark.gluten.sql.columnar.backend.velox.loadQuantum | Set the load quantum for velox file scan. | | +| spark.gluten.sql.columnar.backend.velox.fileHandleCacheEnabled | Disables caching if false. File handle cache should be disabled if files are mutable, i.e. file content may change while file path stays the same. | | +| spark.gluten.sql.columnar.backend.velox.directorySizeGuess | Set the directory size guess for velox file scan. | | +| spark.gluten.sql.columnar.backend.velox.filePreloadThreshold | Set the file preload threshold for velox file scan. | | +| spark.gluten.sql.columnar.backend.velox.prefetchRowGroups | Set the prefetch row groups for velox file scan. | | +| spark.gluten.sql.columnar.backend.velox.loadQuantum | Set the load quantum for velox file scan. | | | spark.gluten.sql.columnar.backend.velox.maxCoalescedDistanceBytes | Set the max coalesced distance bytes for velox file scan. | | | spark.gluten.sql.columnar.backend.velox.maxCoalescedBytes | Set the max coalesced bytes for velox file scan. | | | spark.gluten.sql.columnar.backend.velox.cachePrefetchMinPct | Set prefetch cache min pct for velox file scan. | | | spark.gluten.velox.awsSdkLogLevel | Log granularity of AWS C++ SDK in velox. | FATAL | +| spark.gluten.velox.fs.s3a.retry.mode | Retry mode for AWS s3 connection error, can be "legacy", "standard" and "adaptive". 
| legacy | +| spark.gluten.velox.fs.s3a.connect.timeout | Timeout for AWS s3 connection. | 1s | | spark.gluten.sql.columnar.backend.velox.orc.scan.enabled | Enable velox orc scan. If disabled, vanilla spark orc scan will be used. | true | | spark.gluten.sql.complexType.scan.fallback.enabled | Force fallback for complex type scan, including struct, map, array. | true | -``` -##### Columnar Process Configuration - -spark.plugins org.apache.gluten.GlutenPlugin -spark.shuffle.manager org.apache.spark.shuffle.sort.ColumnarShuffleManager -spark.driver.extraClassPath ${GLUTEN_HOME}/package/target/gluten-XXX.jar -spark.executor.extraClassPath ${GLUTEN_HOME}/package/target/gluten-XXX.jar -###### -``` - Additionally, you can control the configurations of gluten at thread level by local property. -| Parameters | Description | Recommend Setting | -|---------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------| -| gluten.enabledForCurrentThread | Control the usage of gluten at thread level. | true | +| Parameters | Description | Recommend Setting | +|--------------------------------|----------------------------------------------|-------------------| +| gluten.enabledForCurrentThread | Control the usage of gluten at thread level. | true | Below is an example of developing an application using scala to set local properties. -``` -##### Before executing the query, set local properties - +```scala +// Before executing the query, set local properties. sparkContext.setLocalProperty(key, value) spark.sql("select * from demo_tables").show() -###### ``` diff --git a/docs/developers/CppCodingStyle.md b/docs/developers/CppCodingStyle.md index 9dca4cf69fbc7..42101882a9e58 100644 --- a/docs/developers/CppCodingStyle.md +++ b/docs/developers/CppCodingStyle.md @@ -28,11 +28,20 @@ Gluten CPP coding, there are a few Philosophical rules as the following. ## Code Formatting Many aspects of C++ coding style will be covered by clang-format, such as spacing, -line width, indentation and ordering (for includes, using directives and etc).  +line width, indentation and ordering (for includes, using directives and etc). * Always ensure your code is compatible with clang-format-15 for Velox backend. * `dev/formatcppcode.sh` is provided for formatting Velox CPP code. +To format cmake files like CMakeLists.txt & *.cmake, `cmake-format` is required to +be installed. Here is an example. 
+ +``` +apt install python3-pip -y +pip3 install --user cmake-format +cmake-format --first-comment-is-literal True --in-place cpp/velox/CMakeLists.txt +``` + ## Naming Conventions * Use **PascalCase** for types (class, struct, enum, type alias, type diff --git a/docs/developers/MicroBenchmarks.md b/docs/developers/MicroBenchmarks.md index 7fc2a535dcf11..21f222b42690d 100644 --- a/docs/developers/MicroBenchmarks.md +++ b/docs/developers/MicroBenchmarks.md @@ -250,6 +250,93 @@ cd /path/to/gluten/cpp/build/velox/benchmarks --threads 1 --noprint-result --with-shuffle ``` +Developers can leverage the `--with-shuffle` option to benchmark the shuffle-write process by creating +a simple pipeline of `table scan + shuffle write` in Gluten. This can be done by dumping the micro benchmark +inputs from a first stage. The steps are demonstrated as below: + +1. Start spark-shell or pyspark + +We need to set `spark.gluten.sql.benchmark_task.stageId` and `spark.gluten.saveDir` to dump the inputs. +Normally, the stage id should be greater than 0. You can run the command in step 2 in advance to get the +right stage id in your case. We shall set `spark.default.parallelism` to 1 and `spark.sql.files.maxPartitionBytes` +large enough to make sure there will be only 1 task in the first stage. + +``` +# Start pyspark +./bin/pyspark --master local[*] \ +--conf spark.gluten.sql.benchmark_task.stageId=1 \ +--conf spark.gluten.saveDir=/path/to/saveDir \ +--conf spark.default.parallelism=1 \ +--conf spark.sql.files.maxPartitionBytes=10g +... # omit other spark & gluten config +``` + +2. Run the table-scan command to dump the plan for the first stage + +If simulating single or round-robin partitioning, the first stage can only have the table scan operator. + +``` +>>> spark.read.format("parquet").load("file:///example.parquet").show() +``` + +If simulating hash partitioning, there will be a projection for generating the hash partitioning key. +Therefore we need to explicitly run the `repartition` to generate the `scan + project` pipeline for the first stage. +Note that using different number of shuffle partitions here doesn't change the generated pipeline. + +``` +>>> spark.read.format("parquet").load("file:///example.parquet").repartition(10, "key1", "key2").show() +``` + +Simuating range partitioning is not supported. + +3. Run the micro benchmark with dumped inputs + +General configurations for shuffle write: + +- `--with-shuffle`: Add shuffle write process at the end of the pipeline +- `--shuffle-writer`: Specify shuffle writer type. Valid options are sort and hash. Default is hash. +- `--partitioning`: Specify partitioning type. Valid options are rr, hash and single. Defualt is rr. + The partitioning type should match the command in step 2. +- `--shuffle-partitions`: Specify number of shuffle partitions. +- `--compression`: By default, the compression codec for shuffle outputs is lz4. You can switch to other compression codecs + or use hardware accelerators Valid options are: lz4, zstd, qat-gzip, qat-zstd and iaa-gzip. The compression levels are fixed (use default compression level 1). + + Note using QAT or IAA codec requires Gluten cpp is built with these features. + Please check the corresponding section in [Velox document](../get-started/Velox.md) first for how to + setup, build and enable these features in Gluten. For QAT support, please + check [Intel® QuickAssist Technology (QAT) support](../get-started/Velox.md#intel-quickassist-technology-qat-support). 
+ For IAA support, please + check [Intel® In-memory Analytics Accelerator (IAA/IAX) support](../get-started/Velox.md#intel-in-memory-analytics-accelerator-iaaiax-support) + +```shell +cd /path/to/gluten/cpp/build/velox/benchmarks +./generic_benchmark \ +--plan /path/to/saveDir/plan_1_0.json \ +--conf /path/to/saveDir/conf_1_0.ini \ +--split /path/to/saveDir/split_1_0_0.json \ +--with-shuffle \ +--shuffle-writer sort \ +--partitioning hash \ +--threads 1 +``` + +### Run shuffle write task only + +Developers can only run shuffle write task via specifying `--run-shuffle` and `--data` options. +The parquet format input will be read from arrow-parquet reader and sent to shuffle writer. +This option is similar to the `--with-shuffle` option, but it doesn't require the plan and split files. +The round-robin partitioner is used by default. Besides, random partitioning can be used for testing purpose. +By specifying option `--partitioning random`, the partitioner will generate a random partition id for each row. + +```shell +cd /path/to/gluten/cpp/build/velox/benchmarks +./generic_benchmark \ +--run-shuffle \ +--data /path/to/input_for_shuffle_write.parquet +--shuffle-writer sort \ +--threads 1 +``` + ## Simulate write tasks The last operator for a write task is a file write operator, and the output from Velox pipeline only @@ -265,20 +352,6 @@ cd /path/to/gluten/cpp/build/velox/benchmarks --write-path /absolute_path/ ``` -By default, the compression codec for shuffle outputs is LZ4. You can switch to other codecs by -adding one of the following argument flags to the command: - -- --zstd: ZSTD codec, compression level 1 -- --qat-gzip: QAT GZIP codec, compression level 1 -- --qat-zstd: QAT ZSTD codec, compression level 1 -- --iaa-gzip: IAA GZIP codec, compression level 1 - -Note using QAT or IAA codec requires Gluten cpp is built with these features. -Please check the corresponding section in [Velox document](../get-started/Velox.md) first for how to -setup, build and enable these features in Gluten. For QAT support, please -check [Intel® QuickAssist Technology (QAT) support](../get-started/Velox.md#intel-quickassist-technology-qat-support). -For IAA support, please -check [Intel® In-memory Analytics Accelerator (IAA/IAX) support](../get-started/Velox.md#intel-in-memory-analytics-accelerator-iaaiax-support) ## Simulate task spilling diff --git a/docs/developers/NewToGluten.md b/docs/developers/NewToGluten.md index a397003adf36b..fa2ca520f232a 100644 --- a/docs/developers/NewToGluten.md +++ b/docs/developers/NewToGluten.md @@ -43,7 +43,7 @@ export PATH="$PATH:$JAVA_HOME/bin" ## OpenJDK 17 -By default, Gluten compiles package using JDK8. Enable maven profile by `-Pjava-17` to use JDK17, and please make sure your JAVA_HOME points to jdk17. +By default, Gluten compiles package using JDK8. Enable maven profile by `-Pjava-17` to use JDK17 or `-Pjava-11` to use JDK 11, and please make sure your JAVA_HOME points to jdk17 or jdk11 respectively. Apache Spark and Arrow requires setting java args `-Dio.netty.tryReflectionSetAccessible=true`, see [SPARK-29924](https://issues.apache.org/jira/browse/SPARK-29924) and [ARROW-6206](https://issues.apache.org/jira/browse/ARROW-6206). So please add following configs in `spark-defaults.conf`: @@ -289,6 +289,40 @@ Search `default formatter` in `Settings`, select Clang-Format. If your formatOnSave still make no effect, you can use shortcut `SHIFT+ALT+F` to format one file manually. 
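As a command-line alternative to the editor shortcut, a single file can also be formatted in place with clang-format. This is only an illustrative sketch: the file path below is a placeholder, and it assumes `clang-format-15` (the version the Velox backend targets) is installed and on `PATH`.

```bash
# Format one C++ source file in place; replace the path with the file you edited.
clang-format-15 -i cpp/velox/path/to/YourFile.cc
```
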
+### Cmake format + +To format cmake files, like CMakeLists.txt & *.cmake, please install `cmake-format`. +``` +pip3 install --user cmake-format +``` +Here is an example to format a file in command line. +``` +cmake-format --first-comment-is-literal True --in-place cpp/velox/CMakeLists.txt +``` + +After the above installation, you can optionally do some configuration in Visual Studio Code to easily format cmake files. +1. Install `cmake-format` extension in Visual Studio Code. +2. Configure the extension. To do this, open the settings (File -> Preferences -> Settings), search for `cmake-format`, + and do the below settings: + * Set Args: `--first-comment-is-literal=True`. + * Set Exe Path to the path of the `cmake-format` command. If you installed `cmake-format` in a standard + location, you might not need to change this setting. +3. Now, you can format your CMake files by right-clicking in a file and selecting `Format Document`. + +### Add UT + +1. For Native Code Modifications: If you have modified native code, it is best to use gtest to test the native code. + A secondary option is to add Gluten UT to ensure coverage. + +2. For Gluten-Related Code Modifications: If you have modified code related to Gluten, it is preferable to add scalatest rather than JUnit. + Additionally, the test classes should be placed in the org.apache.gluten package. + +3. For Spark-Related Code Modifications: If you have modified code related to Spark, it is preferable to add scalatest rather than JUnit. + Additionally, the test classes should be placed in the org.apache.spark package. + +4. Placement of Non-Native Code UTs: Ensure that unit tests for non-native code are placed within org.apache.gluten and org.apache.spark packages. + This is important because the CI system runs unit tests from these two paths in parallel. Placing tests in other paths might cause your tests to be ignored. + # Debug cpp code with coredump ```bash diff --git a/docs/developers/SubstraitModifications.md b/docs/developers/SubstraitModifications.md index 38406425af969..24a9c1a2128d7 100644 --- a/docs/developers/SubstraitModifications.md +++ b/docs/developers/SubstraitModifications.md @@ -27,6 +27,7 @@ changed `Unbounded` in `WindowFunction` into `Unbounded_Preceding` and `Unbounde * Added `PartitionColumn` in `LocalFiles`([#2405](https://github.com/apache/incubator-gluten/pull/2405)). * Added `WriteRel` ([#3690](https://github.com/apache/incubator-gluten/pull/3690)). * Added `TopNRel` ([#5409](https://github.com/apache/incubator-gluten/pull/5409)). +* Added `ref` field in window bound `Preceding` and `Following` ([#5626](https://github.com/apache/incubator-gluten/pull/5626)). ## Modifications to type.proto diff --git a/docs/developers/VeloxUDF.md b/docs/developers/VeloxUDF.md index b88c4de1515e7..c896fd6726573 100644 --- a/docs/developers/VeloxUDF.md +++ b/docs/developers/VeloxUDF.md @@ -137,13 +137,10 @@ You can also specify the local or HDFS URIs to the UDF libraries or archives. Lo ## Try the example We provided Velox UDF examples in file [MyUDF.cc](../../cpp/velox/udf/examples/MyUDF.cc) and UDAF examples in file [MyUDAF.cc](../../cpp/velox/udf/examples/MyUDAF.cc). -You need to build the gluten cpp project with `--build_example=ON` to get the example libraries. +You need to build the gluten project with `--build_example=ON` to get the example libraries. 
```shell -## compile Gluten cpp module -cd /path/to/gluten/cpp -## if you use custom velox_home, make sure specified here by --velox_home -./compile.sh --build_velox_backend=ON --build_examples=ON +./dev/buildbundle-veloxbe.sh --build_examples=ON ``` Then, you can find the example libraries at /path/to/gluten/cpp/build/velox/udf/examples/ diff --git a/docs/get-started/ClickHouse.md b/docs/get-started/ClickHouse.md index 4352a99e55f93..38ce048fe0def 100644 --- a/docs/get-started/ClickHouse.md +++ b/docs/get-started/ClickHouse.md @@ -629,19 +629,26 @@ public read-only account:gluten/hN2xX3uQ4m ### Celeborn support -Gluten with clickhouse backend has not yet supportted [Celeborn](https://github.com/apache/celeborn) natively as remote shuffle service using columar shuffle. However, you can still use Celeborn with row shuffle, which means a ColumarBatch will be converted to a row during shuffle. -Below introduction is used to enable this feature: +Gluten with clickhouse backend supports [Celeborn](https://github.com/apache/celeborn) as remote shuffle service. Currently, the supported Celeborn versions are `0.3.x` and `0.4.0`. + +Below introduction is used to enable this feature. First refer to this URL(https://github.com/apache/celeborn) to setup a celeborn cluster. +When compiling the Gluten Java module, it's required to enable `celeborn` profile, as follows: + +``` +mvn clean package -Pbackends-clickhouse -Pspark-3.3 -Pceleborn -DskipTests +``` + Then add the Spark Celeborn Client packages to your Spark application's classpath(usually add them into `$SPARK_HOME/jars`). - Celeborn: celeborn-client-spark-3-shaded_2.12-[celebornVersion].jar -Currently to use Celeborn following configurations are required in `spark-defaults.conf` +Currently to use Gluten following configurations are required in `spark-defaults.conf` ``` -spark.shuffle.manager org.apache.spark.shuffle.celeborn.SparkShuffleManager +spark.shuffle.manager org.apache.spark.shuffle.gluten.celeborn.CelebornShuffleManager # celeborn master spark.celeborn.master.endpoints clb-master:9097 @@ -670,24 +677,6 @@ spark.celeborn.storage.hdfs.dir hdfs:///celeborn spark.dynamicAllocation.enabled false ``` -#### Celeborn Columnar Shuffle Support -Currently, the supported Celeborn versions are `0.3.x` and `0.4.0`. -The native Celeborn support can be enabled by the following configuration -``` -spark.shuffle.manager=org.apache.spark.shuffle.gluten.celeborn.CelebornShuffleManager -``` - -quickly start a celeborn cluster -```shell -wget https://archive.apache.org/dist/incubator/celeborn/celeborn-0.3.0-incubating/apache-celeborn-0.3.0-incubating-bin.tgz && \ -tar -zxvf apache-celeborn-0.3.0-incubating-bin.tgz && \ -mv apache-celeborn-0.3.0-incubating-bin/conf/celeborn-defaults.conf.template apache-celeborn-0.3.0-incubating-bin/conf/celeborn-defaults.conf && \ -mv apache-celeborn-0.3.0-incubating-bin/conf/log4j2.xml.template apache-celeborn-0.3.0-incubating-bin/conf/log4j2.xml && \ -mkdir /opt/hadoop && chmod 777 /opt/hadoop && \ -echo -e "celeborn.worker.flusher.threads 4\nceleborn.worker.storage.dirs /tmp\nceleborn.worker.monitor.disk.enabled false" > apache-celeborn-0.3.0-incubating-bin/conf/celeborn-defaults.conf && \ -bash apache-celeborn-0.3.0-incubating-bin/sbin/start-master.sh && bash apache-celeborn-0.3.0-incubating-bin/sbin/start-worker.sh -``` - ### Columnar shuffle mode We have two modes of columnar shuffle 1. 
prefer cache diff --git a/docs/get-started/Velox.md b/docs/get-started/Velox.md index a24c99fda3028..5f9ae2a46b19d 100644 --- a/docs/get-started/Velox.md +++ b/docs/get-started/Velox.md @@ -9,7 +9,7 @@ parent: Getting-Started | Type | Version | |-------|---------------------------------| -| Spark | 3.2.2, 3.3.1, 3.4.2, 3.5.1(wip) | +| Spark | 3.2.2, 3.3.1, 3.4.2, 3.5.1 | | OS | Ubuntu20.04/22.04, Centos7/8 | | jdk | openjdk8/jdk17 | | scala | 2.12 | @@ -19,7 +19,7 @@ parent: Getting-Started Currently, Gluten+Velox backend is only tested on **Ubuntu20.04/Ubuntu22.04/Centos7/Centos8**. Other kinds of OS support are still in progress. The long term goal is to support several common OS and conda env deployment. -Gluten only fully tested in CI with 3.2.2, 3.3.1 and 3.4.2. We will add/update supported/tested versions according to the upstream changes. +Gluten only fully tested in CI with 3.2.2, 3.3.1, 3.4.2 and 3.5.1. We will add/update supported/tested versions according to the upstream changes. We need to set up the `JAVA_HOME` env. Currently, Gluten supports **java 8** and **java 17**. @@ -224,11 +224,11 @@ Currently there are several ways to asscess S3 in Spark. Please refer [Velox S3] Gluten with velox backend supports [Celeborn](https://github.com/apache/celeborn) as remote shuffle service. Currently, the supported Celeborn versions are `0.3.x` and `0.4.0`. -Below introduction is used to enable this feature +Below introduction is used to enable this feature. First refer to this URL(https://github.com/apache/celeborn) to setup a celeborn cluster. -When compiling the Gluten Java module, it's required to enable `rss` profile, as follows: +When compiling the Gluten Java module, it's required to enable `celeborn` profile, as follows: ``` mvn clean package -Pbackends-velox -Pspark-3.3 -Pceleborn -DskipTests @@ -303,8 +303,7 @@ First of all, compile gluten-iceberg module by a `iceberg` profile, as follows: mvn clean package -Pbackends-velox -Pspark-3.3 -Piceberg -DskipTests ``` -Then, put the additional `gluten-iceberg-XX-SNAPSHOT.jar` to the class path (usually it's `$SPARK_HOME/jars`). -The gluten-iceberg jar is in `gluten-iceberg/target` directory. +Once built successfully, iceberg features will be included in gluten-velox-bundle-X jar. Then you can query iceberg table by gluten/velox without scan's fallback. After the two steps, you can query iceberg table by gluten/velox without scan's fallback. diff --git a/docs/get-started/build-guide.md b/docs/get-started/build-guide.md index 3db2244ba229a..a9e9bd1266df6 100644 --- a/docs/get-started/build-guide.md +++ b/docs/get-started/build-guide.md @@ -14,8 +14,8 @@ Please set them via `--`, e.g. `--build_type=Release`. | build_tests | Build gluten cpp tests. | OFF | | build_examples | Build udf example. | OFF | | build_benchmarks | Build gluten cpp benchmarks. | OFF | -| build_jemalloc | Build with jemalloc. | ON | -| build_protobuf | Build protobuf lib. | ON | +| build_jemalloc | Build with jemalloc. | OFF | +| build_protobuf | Build protobuf lib. | OFF | | enable_qat | Enable QAT for shuffle data de/compression. | OFF | | enable_iaa | Enable IAA for shuffle data de/compression. | OFF | | enable_hbm | Enable HBM allocator. | OFF | @@ -55,17 +55,18 @@ Please set them via `--`, e.g., `--velox_home=/YOUR/PATH`. ### Maven build parameters The below parameters can be set via `-P` for mvn. 
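As an illustration of how these profiles combine on one command line, the sketch below builds the Velox backend for Spark 3.4 with Celeborn and Iceberg support enabled; the exact profile set is an example only, so swap in the profiles that match your deployment.

```
mvn clean package -Pbackends-velox -Pspark-3.4 -Pceleborn -Piceberg -DskipTests
```
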
-| Parameters | Description | Default state | -|---------------------|------------------------------------------------------------------------------|---------------| -| backends-velox | Build Gluten Velox backend. | disabled | -| backends-clickhouse | Build Gluten ClickHouse backend. | disabled | -| rss | Build Gluten with Remote Shuffle Service, only applicable for Velox backend. | disabled | -| delta | Build Gluten with Delta Lake support. | disabled | -| iceberg | Build Gluten with Iceberg support. | disabled | -| spark-3.2 | Build Gluten for Spark 3.2. | enabled | -| spark-3.3 | Build Gluten for Spark 3.3. | disabled | -| spark-3.4 | Build Gluten for Spark 3.4. | disabled | -| spark-3.5 | Build Gluten for Spark 3.5. | disabled | +| Parameters | Description | Default state | +|---------------------|---------------------------------------|---------------| +| backends-velox | Build Gluten Velox backend. | disabled | +| backends-clickhouse | Build Gluten ClickHouse backend. | disabled | +| celeborn | Build Gluten with Celeborn. | disabled | +| uniffle | Build Gluten with Uniffle. | disabled | +| delta | Build Gluten with Delta Lake support. | disabled | +| iceberg | Build Gluten with Iceberg support. | disabled | +| spark-3.2 | Build Gluten for Spark 3.2. | enabled | +| spark-3.3 | Build Gluten for Spark 3.3. | disabled | +| spark-3.4 | Build Gluten for Spark 3.4. | disabled | +| spark-3.5 | Build Gluten for Spark 3.5. | disabled | ## Gluten Jar for Deployment The gluten jar built out is under `GLUTEN_SRC/package/target/`. diff --git a/docs/velox-backend-limitations.md b/docs/velox-backend-limitations.md index 75b52f38e17ad..002bbb3c30175 100644 --- a/docs/velox-backend-limitations.md +++ b/docs/velox-backend-limitations.md @@ -118,6 +118,10 @@ spark.range(100).toDF("id") .saveAsTable("velox_ctas") ``` +#### HiveFileFormat write + +Gluten supports writes of HiveFileFormat when the output file type is of type `parquet` only + #### NaN support Velox does NOT support NaN. So unexpected result can be obtained for a few cases, e.g., comparing a number with NaN. diff --git a/docs/velox-backend-support-progress.md b/docs/velox-backend-support-progress.md index ccb253c24b48f..f3f1e100b2b25 100644 --- a/docs/velox-backend-support-progress.md +++ b/docs/velox-backend-support-progress.md @@ -100,7 +100,7 @@ Gluten supports 199 functions. (Drag to right to see all data types) | & | bitwise_and | bitwise_and | S | | | | | | | | | | | | | | | | | | | | | * | multiply | multiply | S | ANSI OFF | | S | S | S | S | S | | | | | | | | | | | | | | + | plus | add | S | ANSI OFF | | S | S | S | S | S | | | | | | | | | | | | | -| - | minus | substract | S | ANSI OFF | | S | S | S | S | S | | | | | | | | | | | | | +| - | minus | subtract | S | ANSI OFF | | S | S | S | S | S | | | | | | | | | | | | | | / | divide | divide | S | ANSI OFF | | S | S | S | S | S | | | | | | | | | | | | | | < | lt | lessthan | S | | S | S | S | S | S | S | S | | | S | | | | | | | | | | <= | lte | lessthanorequa | S | | S | S | S | S | S | S | S | | | S | | | | | | | | | @@ -178,7 +178,7 @@ Gluten supports 199 functions. 
(Drag to right to see all data types) | rpad | rpad | | S | | | | | | | | | | | S | | | | | | | | | | rtrim | rtrim | rtrim | S | | | | | | | | | | | S | | | | | | | | | | sentences | | | | | | | | | | | | | | | | | | | | | | | -| soundex | | | | | | | | | | | | | | | | | | | | | | | +| soundex | | soundex | S | | | | | | | | | | | | | | | | | | | | | space | | | | | | | | | | | | | | | | | | | | | | | | split | split | split | S | Mismatched | | | | | | | | | | | | | | | | | | | | split_part | split_part | | | Mismatched | | | | | | | | | | | | | | | | | | | @@ -233,7 +233,7 @@ Gluten supports 199 functions. (Drag to right to see all data types) | hex | | | S | | | | | | S | | | | | S | | | S | | | | | | | hypot | | | | | | | | | | | | | | | | | | | | | | | | ln | ln | | S | | | S | S | S | S | S | S | | | | | | | | | | | | -| log | ln | | S | | | S | S | S | S | S | S | | | | | | | | | | | | +| log | ln | log | S | | | S | S | S | S | S | S | | | | | | | | | | | | | log10 | log10 | | S | | | S | S | S | S | S | S | | | | | | | | | | | | | log1p | | | | | | | | | | | | | | | | | | | | | | | | log2 | log2 | | S | | | S | S | S | S | S | S | | | | | | | | | | | | @@ -272,7 +272,7 @@ Gluten supports 199 functions. (Drag to right to see all data types) | array_repeat | | | S | | S | S | S | S | S | S | S | S | S | S | S | | | | | | | | | array_sort | array_sort | array_sort | S | | | | | | | | | | | | | | | | | | | | | array_union | | | | | | | | | | | | | | | | | | | | | | | -| arrays_overlap | array_overlap | | | | | | | | | | | | | | | | | | | | | | +| arrays_overlap | array_overlap | S | | | | | | | | | | | | | | | | | | | | | | arrays_zip | zip | | S | | | | | | | | | | | | | | | | | | | | | cardinality | cardinality | | | | | | | | | | | | | | | | | | | | | | | element_at | element_at | element_at | S | | | | | | | | | | | | | | | | S | S | | | diff --git a/ep/build-velox/src/build_velox.sh b/ep/build-velox/src/build_velox.sh index 3fc0dc6f661f2..b55f65a98e9e3 100755 --- a/ep/build-velox/src/build_velox.sh +++ b/ep/build-velox/src/build_velox.sh @@ -34,7 +34,6 @@ ENABLE_TESTS=OFF # Set to ON for gluten cpp test build. 
BUILD_TEST_UTILS=OFF RUN_SETUP_SCRIPT=ON -COMPILE_ARROW_JAVA=ON NUM_THREADS="" OTHER_ARGUMENTS="" @@ -87,10 +86,6 @@ for arg in "$@"; do RUN_SETUP_SCRIPT=("${arg#*=}") shift # Remove argument name from processing ;; - --compile_arrow_java=*) - COMPILE_ARROW_JAVA=("${arg#*=}") - shift # Remove argument name from processing - ;; --num_threads=*) NUM_THREADS=("${arg#*=}") shift # Remove argument name from processing @@ -151,7 +146,9 @@ function compile { fi echo "NUM_THREADS_OPTS: $NUM_THREADS_OPTS" - export simdjson_SOURCE=BUNDLED + export simdjson_SOURCE=AUTO + # Quick fix for CI error due to velox rebase + export Arrow_SOURCE=BUNDLED if [ $ARCH == 'x86_64' ]; then make $COMPILE_TYPE $NUM_THREADS_OPTS EXTRA_CMAKE_FLAGS="${COMPILE_OPTION}" elif [[ "$ARCH" == 'arm64' || "$ARCH" == 'aarch64' ]]; then @@ -191,7 +188,7 @@ function get_build_summary { echo "ENABLE_S3=$ENABLE_S3,ENABLE_GCS=$ENABLE_GCS,ENABLE_HDFS=$ENABLE_HDFS,ENABLE_ABFS=$ENABLE_ABFS,\ BUILD_TYPE=$BUILD_TYPE,VELOX_HOME=$VELOX_HOME,ENABLE_BENCHMARK=$ENABLE_BENCHMARK,\ ENABLE_TESTS=$ENABLE_TESTS,BUILD_TEST_UTILS=$BUILD_TEST_UTILS,\ -COMPILE_ARROW_JAVA=$COMPILE_ARROW_JAVA,OTHER_ARGUMENTS=$OTHER_ARGUMENTS,COMMIT_HASH=$COMMIT_HASH" +OTHER_ARGUMENTS=$OTHER_ARGUMENTS,COMMIT_HASH=$COMMIT_HASH" } function check_commit { @@ -277,32 +274,6 @@ function setup_linux { fi } -function compile_arrow_java_module() { - ARROW_HOME="${VELOX_HOME}/_build/$COMPILE_TYPE/third_party/arrow_ep/src/arrow_ep" - ARROW_INSTALL_DIR="${ARROW_HOME}/../../install" - - pushd $ARROW_HOME/java - - mvn clean install -pl maven/module-info-compiler-maven-plugin -am \ - -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip - - # Arrow C Data Interface CPP libraries - mvn generate-resources -P generate-libs-cdata-all-os -Darrow.c.jni.dist.dir=$ARROW_INSTALL_DIR \ - -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -N - - # Arrow JNI Date Interface CPP libraries - export PKG_CONFIG_PATH=/usr/local/lib64/pkgconfig${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}} - mvn generate-resources -Pgenerate-libs-jni-macos-linux -N -Darrow.dataset.jni.dist.dir=$ARROW_INSTALL_DIR \ - -DARROW_GANDIVA=OFF -DARROW_JAVA_JNI_ENABLE_GANDIVA=OFF -DARROW_ORC=OFF -DARROW_JAVA_JNI_ENABLE_ORC=OFF \ - -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -N - - # Arrow Java libraries - mvn clean install -Parrow-jni -P arrow-c-data -pl dataset,c -am \ - -Darrow.c.jni.dist.dir=$ARROW_INSTALL_DIR/lib -Darrow.dataset.jni.dist.dir=$ARROW_INSTALL_DIR/lib -Darrow.cpp.build.dir=$ARROW_INSTALL_DIR/lib \ - -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip - popd -} - CURRENT_DIR=$( cd "$(dirname "$BASH_SOURCE")" pwd @@ -332,9 +303,5 @@ echo "Target Velox build: $TARGET_BUILD_SUMMARY" check_commit compile -if [ $COMPILE_ARROW_JAVA == "ON" ]; then - compile_arrow_java_module -fi - echo "Successfully built Velox from Source." 
-echo $TARGET_BUILD_SUMMARY >"${VELOX_HOME}/velox-build.cache" +echo $TARGET_BUILD_SUMMARY > "${VELOX_HOME}/velox-build.cache" diff --git a/ep/build-velox/src/get_velox.sh b/ep/build-velox/src/get_velox.sh index 6c25c8f084261..6cd62332a3ccf 100755 --- a/ep/build-velox/src/get_velox.sh +++ b/ep/build-velox/src/get_velox.sh @@ -17,13 +17,11 @@ set -exu VELOX_REPO=https://github.com/oap-project/velox.git -VELOX_BRANCH=2024_05_24 +VELOX_BRANCH=2024_07_03 VELOX_HOME="" #Set on run gluten on HDFS ENABLE_HDFS=OFF -#It can be set to OFF when compiling velox again -BUILD_PROTOBUF=ON #Set on run gluten on S3 ENABLE_S3=OFF #Set on run gluten on GCS @@ -47,10 +45,6 @@ for arg in "$@"; do VELOX_HOME=("${arg#*=}") shift # Remove argument name from processing ;; - --build_protobuf=*) - BUILD_PROTOBUF=("${arg#*=}") - shift # Remove argument name from processing - ;; --enable_hdfs=*) ENABLE_HDFS=("${arg#*=}") shift # Remove argument name from processing @@ -95,19 +89,18 @@ function process_setup_ubuntu { sed -i '/ccache/a\ curl \\' scripts/setup-ubuntu.sh sed -i '/libgmock-dev/d' scripts/setup-ubuntu.sh # resolved by ep/build-velox/build/velox_ep/CMake/resolve_dependency_modules/gtest.cmake sed -i 's/github_checkout boostorg\/boost \"\${BOOST_VERSION}\" --recursive/wget_and_untar https:\/\/github.com\/boostorg\/boost\/releases\/download\/boost-1.84.0\/boost-1.84.0.tar.gz boost \&\& cd boost/g' scripts/setup-ubuntu.sh + sed -i '/^function install_folly.*/i function install_protobuf {\n wget https://github.com/protocolbuffers/protobuf/releases/download/v21.4/protobuf-all-21.4.tar.gz\n tar -xzf protobuf-all-21.4.tar.gz\n cd protobuf-21.4\n ./configure CXXFLAGS="-fPIC" --prefix=/usr/local\n make "-j$(nproc)"\n sudo make install\n sudo ldconfig\n}\n' scripts/setup-ubuntu.sh + sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_protobuf' scripts/setup-ubuntu.sh if [ $ENABLE_HDFS == "ON" ]; then sed -i '/^function install_folly.*/i function install_libhdfs3 {\n github_checkout oap-project/libhdfs3 master \n cmake_install\n}\n' scripts/setup-ubuntu.sh sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_libhdfs3' scripts/setup-ubuntu.sh sed -i '/ccache /a\ yasm \\' scripts/setup-ubuntu.sh fi - if [ $BUILD_PROTOBUF == "ON" ]; then - sed -i '/^function install_folly.*/i function install_protobuf {\n wget https://github.com/protocolbuffers/protobuf/releases/download/v21.4/protobuf-all-21.4.tar.gz\n tar -xzf protobuf-all-21.4.tar.gz\n cd protobuf-21.4\n ./configure CXXFLAGS="-fPIC" --prefix=/usr/local\n make "-j$(nproc)"\n sudo make install\n sudo ldconfig\n}\n' scripts/setup-ubuntu.sh - sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_protobuf' scripts/setup-ubuntu.sh - fi sed -i "s/apt install -y/sudo apt install -y/" ${VELOX_HOME}/scripts/setup-adapters.sh if [ $ENABLE_S3 == "ON" ]; then sed -i '/^ run_and_time install_folly/a \ \ '${VELOX_HOME}/scripts'/setup-adapters.sh aws' scripts/setup-ubuntu.sh # it's used for velox CI + sed -i '/rpm -i minio-20220526054841.0.0.x86_64.rpm/a \ \ echo "Skip installing minio"' scripts/setup-adapters.sh sed -i 's/rpm -i minio-20220526054841.0.0.x86_64.rpm/#rpm -i minio-20220526054841.0.0.x86_64.rpm/g' scripts/setup-adapters.sh fi if [ $ENABLE_GCS == "ON" ]; then @@ -135,15 +128,14 @@ function process_setup_centos8 { sed -i '/^dnf_install autoconf/a\dnf_install libxml2-devel libgsasl-devel libuuid-devel' scripts/setup-centos8.sh sed -i '/^function install_gflags.*/i function install_openssl {\n wget_and_untar 
https://github.com/openssl/openssl/archive/refs/tags/OpenSSL_1_1_1s.tar.gz openssl \n cd openssl \n ./config no-shared && make depend && make && sudo make install \n cd ..\n}\n' scripts/setup-centos8.sh sed -i '/^ run_and_time install_fbthrift/a \ run_and_time install_openssl' scripts/setup-centos8.sh + sed -i '/cd protobuf/{n;s/\.\/configure --prefix=\/usr/\.\/configure CXXFLAGS="-fPIC" --prefix=\/usr\/local/;}' scripts/setup-centos8.sh if [ $ENABLE_HDFS == "ON" ]; then sed -i '/^function install_gflags.*/i function install_libhdfs3 {\n cd "\${DEPENDENCY_DIR}"\n github_checkout oap-project/libhdfs3 master\n cmake_install\n}\n' scripts/setup-centos8.sh sed -i '/^ run_and_time install_fbthrift/a \ run_and_time install_libhdfs3' scripts/setup-centos8.sh sed -i '/^ dnf_install ninja-build/a\ dnf_install yasm\' scripts/setup-centos8.sh fi - if [[ $BUILD_PROTOBUF == "ON" ]] || [[ $ENABLE_HDFS == "ON" ]]; then - sed -i '/cd protobuf/{n;s/\.\/configure --prefix=\/usr/\.\/configure CXXFLAGS="-fPIC" --prefix=\/usr\/local/;}' scripts/setup-centos8.sh - fi + sed -i "s/yum -y install/sudo yum -y install/" ${VELOX_HOME}/scripts/setup-adapters.sh if [ $ENABLE_S3 == "ON" ]; then sed -i '/^ run_and_time install_fbthrift/a \ \ '${VELOX_HOME}/scripts'/setup-adapters.sh aws' scripts/setup-centos8.sh @@ -171,15 +163,12 @@ function process_setup_centos7 { # install gtest sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_gtest' scripts/setup-centos7.sh - + sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_protobuf' scripts/setup-centos7.sh if [ $ENABLE_HDFS = "ON" ]; then sed -i '/^function install_protobuf.*/i function install_libhdfs3 {\n cd "\${DEPENDENCY_DIR}"\n github_checkout oap-project/libhdfs3 master \n cmake_install\n}\n' scripts/setup-centos7.sh sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_libhdfs3' scripts/setup-centos7.sh sed -i '/^dnf_install ccache/a\ \ yasm \\' scripts/setup-centos7.sh fi - if [[ $BUILD_PROTOBUF == "ON" ]] || [[ $ENABLE_HDFS == "ON" ]]; then - sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_protobuf' scripts/setup-centos7.sh - fi sed -i "s/yum -y install/sudo yum -y install/" ${VELOX_HOME}/scripts/setup-adapters.sh if [ $ENABLE_S3 == "ON" ]; then sed -i '/^ run_and_time install_folly/a \ \ '${VELOX_HOME}/scripts'/setup-adapters.sh aws' scripts/setup-centos7.sh @@ -218,7 +207,6 @@ function process_setup_tencentos32 { echo "Preparing Velox source code..." 
echo "ENABLE_HDFS=${ENABLE_HDFS}" -echo "BUILD_PROTOBUF=${BUILD_PROTOBUF}" CURRENT_DIR=$( cd "$(dirname "$BASH_SOURCE")" @@ -255,9 +243,11 @@ function apply_compilation_fixes { current_dir=$1 velox_home=$2 sudo cp ${current_dir}/modify_velox.patch ${velox_home}/ - sudo cp ${current_dir}/modify_arrow.patch ${velox_home}/third_party/ + sudo cp ${current_dir}/modify_arrow.patch ${velox_home}/CMake/resolve_dependency_modules/arrow/ + sudo cp ${current_dir}/modify_arrow_dataset_scan_option.patch ${velox_home}/CMake/resolve_dependency_modules/arrow/ git add ${velox_home}/modify_velox.patch # to avoid the file from being deleted by git clean -dffx :/ - git add ${velox_home}/third_party/modify_arrow.patch # to avoid the file from being deleted by git clean -dffx :/ + git add ${velox_home}/CMake/resolve_dependency_modules/arrow/modify_arrow.patch # to avoid the file from being deleted by git clean -dffx :/ + git add ${velox_home}/CMake/resolve_dependency_modules/arrow/modify_arrow_dataset_scan_option.patch # to avoid the file from being deleted by git clean -dffx :/ cd ${velox_home} echo "Applying patch to Velox source code..." git apply modify_velox.patch @@ -272,8 +262,8 @@ function setup_linux { local LINUX_VERSION_ID=$(. /etc/os-release && echo ${VERSION_ID}) # apply patches - sed -i 's/^ ninja -C "${BINARY_DIR}" install/ sudo ninja -C "${BINARY_DIR}" install/g' scripts/setup-helper-functions.sh sed -i 's/-mavx2 -mfma -mavx -mf16c -mlzcnt -std=c++17/-march=native -std=c++17 -mno-avx512f/g' scripts/setup-helper-functions.sh + sed -i 's/SUDO="${SUDO:-""}"/SUDO="${SUDO:-"sudo --preserve-env"}"/g' scripts/setup-helper-functions.sh if [[ "$LINUX_DISTRIBUTION" == "ubuntu" || "$LINUX_DISTRIBUTION" == "debian" || "$LINUX_DISTRIBUTION" == "pop" ]]; then process_setup_ubuntu elif [[ "$LINUX_DISTRIBUTION" == "centos" ]]; then diff --git a/ep/build-velox/src/modify_arrow.patch b/ep/build-velox/src/modify_arrow.patch index 5814958a936fa..7d4d8e557b582 100644 --- a/ep/build-velox/src/modify_arrow.patch +++ b/ep/build-velox/src/modify_arrow.patch @@ -1,3 +1,42 @@ +diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt +index d56f6a36d..9b4088df9 100644 +--- a/cpp/CMakeLists.txt ++++ b/cpp/CMakeLists.txt +@@ -773,8 +773,7 @@ if(ARROW_ORC) + list(APPEND ARROW_SHARED_LINK_LIBS orc::orc ${ARROW_PROTOBUF_LIBPROTOBUF}) + list(APPEND ARROW_STATIC_LINK_LIBS orc::orc ${ARROW_PROTOBUF_LIBPROTOBUF}) + if(ORC_SOURCE STREQUAL "SYSTEM") +- list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc::orc +- ${ARROW_PROTOBUF_LIBPROTOBUF}) ++ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc::orc) + endif() + endif() + +@@ -823,9 +822,6 @@ if(ARROW_WITH_OPENTELEMETRY) + opentelemetry-cpp::ostream_span_exporter + opentelemetry-cpp::otlp_http_exporter) + endif() +- if(Protobuf_SOURCE STREQUAL "SYSTEM") +- list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF}) +- endif() + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS CURL::libcurl) + endif() + +@@ -860,6 +856,14 @@ if(ARROW_USE_XSIMD) + list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_XSIMD}) + endif() + ++# This should be done after if(ARROW_ORC) and if(ARROW_WITH_OPENTELEMETRY) ++# because they depend on Protobuf. 
++if(ARROW_WITH_PROTOBUF) ++ if(Protobuf_SOURCE STREQUAL "SYSTEM") ++ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF}) ++ endif() ++endif() ++ + add_custom_target(arrow_dependencies) + add_custom_target(arrow_benchmark_dependencies) + add_custom_target(arrow_test_dependencies) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index a2627c190..e453512e6 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake diff --git a/ep/build-velox/src/modify_arrow_dataset_scan_option.patch b/ep/build-velox/src/modify_arrow_dataset_scan_option.patch new file mode 100644 index 0000000000000..4af78c030c006 --- /dev/null +++ b/ep/build-velox/src/modify_arrow_dataset_scan_option.patch @@ -0,0 +1,883 @@ +diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc +index 09ab77572..f09377cf9 100644 +--- a/cpp/src/arrow/dataset/file_csv.cc ++++ b/cpp/src/arrow/dataset/file_csv.cc +@@ -24,6 +24,7 @@ + #include + #include + ++#include "arrow/c/bridge.h" + #include "arrow/csv/options.h" + #include "arrow/csv/parser.h" + #include "arrow/csv/reader.h" +@@ -52,6 +53,9 @@ using internal::Executor; + using internal::SerialExecutor; + + namespace dataset { ++namespace { ++inline bool parseBool(const std::string& value) { return value == "true" ? true : false; } ++} // namespace + + struct CsvInspectedFragment : public InspectedFragment { + CsvInspectedFragment(std::vector column_names, +@@ -503,5 +507,33 @@ Future<> CsvFileWriter::FinishInternal() { + return Status::OK(); + } + ++Result> CsvFragmentScanOptions::from( ++ const std::unordered_map& configs) { ++ std::shared_ptr options = ++ std::make_shared(); ++ for (auto const& it : configs) { ++ auto& key = it.first; ++ auto& value = it.second; ++ if (key == "delimiter") { ++ options->parse_options.delimiter = value.data()[0]; ++ } else if (key == "quoting") { ++ options->parse_options.quoting = parseBool(value); ++ } else if (key == "column_types") { ++ int64_t schema_address = std::stol(value); ++ ArrowSchema* cSchema = reinterpret_cast(schema_address); ++ ARROW_ASSIGN_OR_RAISE(auto schema, arrow::ImportSchema(cSchema)); ++ auto& column_types = options->convert_options.column_types; ++ for (auto field : schema->fields()) { ++ column_types[field->name()] = field->type(); ++ } ++ } else if (key == "strings_can_be_null") { ++ options->convert_options.strings_can_be_null = parseBool(value); ++ } else { ++ return Status::Invalid("Config " + it.first + "is not supported."); ++ } ++ } ++ return options; ++} ++ + } // namespace dataset + } // namespace arrow +diff --git a/cpp/src/arrow/dataset/file_csv.h b/cpp/src/arrow/dataset/file_csv.h +index 42e3fd724..4d2825183 100644 +--- a/cpp/src/arrow/dataset/file_csv.h ++++ b/cpp/src/arrow/dataset/file_csv.h +@@ -85,6 +85,9 @@ class ARROW_DS_EXPORT CsvFileFormat : public FileFormat { + struct ARROW_DS_EXPORT CsvFragmentScanOptions : public FragmentScanOptions { + std::string type_name() const override { return kCsvTypeName; } + ++ static Result> from( ++ const std::unordered_map& configs); ++ + using StreamWrapFunc = std::function>( + std::shared_ptr)>; + +diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc +index 5d892af9a..0f8b0448b 100644 +--- a/cpp/src/arrow/engine/substrait/expression_internal.cc ++++ b/cpp/src/arrow/engine/substrait/expression_internal.cc +@@ -1337,5 +1337,17 @@ Result> ToProto( + return std::move(out); + } + ++Status FromProto(const 
substrait::Expression::Literal& literal, ++ std::unordered_map& out) { ++ ARROW_RETURN_IF(!literal.has_map(), Status::Invalid("Literal does not have a map.")); ++ auto literalMap = literal.map(); ++ auto size = literalMap.key_values_size(); ++ for (auto i = 0; i < size; i++) { ++ substrait::Expression_Literal_Map_KeyValue keyValue = literalMap.key_values(i); ++ out.emplace(keyValue.key().string(), keyValue.value().string()); ++ } ++ return Status::OK(); ++} ++ + } // namespace engine + } // namespace arrow +diff --git a/cpp/src/arrow/engine/substrait/expression_internal.h b/cpp/src/arrow/engine/substrait/expression_internal.h +index 2ce2ee76a..9be81b7ab 100644 +--- a/cpp/src/arrow/engine/substrait/expression_internal.h ++++ b/cpp/src/arrow/engine/substrait/expression_internal.h +@@ -61,5 +61,9 @@ ARROW_ENGINE_EXPORT + Result FromProto(const substrait::AggregateFunction&, bool is_hash, + const ExtensionSet&, const ConversionOptions&); + ++ARROW_ENGINE_EXPORT ++Status FromProto(const substrait::Expression::Literal& literal, ++ std::unordered_map& out); ++ + } // namespace engine + } // namespace arrow +diff --git a/cpp/src/arrow/engine/substrait/serde.cc b/cpp/src/arrow/engine/substrait/serde.cc +index 9e670f121..02e5c7171 100644 +--- a/cpp/src/arrow/engine/substrait/serde.cc ++++ b/cpp/src/arrow/engine/substrait/serde.cc +@@ -247,6 +247,16 @@ Result DeserializeExpressions( + return FromProto(extended_expression, ext_set_out, conversion_options, registry); + } + ++Status DeserializeMap(const Buffer& buf, ++ std::unordered_map& out) { ++ // ARROW_ASSIGN_OR_RAISE(auto advanced_extension, ++ // ParseFromBuffer(buf)); ++ // return FromProto(advanced_extension, out); ++ ARROW_ASSIGN_OR_RAISE(auto literal, ++ ParseFromBuffer(buf)); ++ return FromProto(literal, out); ++} ++ + namespace { + + Result> MakeSingleDeclarationPlan( +diff --git a/cpp/src/arrow/engine/substrait/serde.h b/cpp/src/arrow/engine/substrait/serde.h +index ab749f4a6..6312ec239 100644 +--- a/cpp/src/arrow/engine/substrait/serde.h ++++ b/cpp/src/arrow/engine/substrait/serde.h +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include + + #include "arrow/compute/type_fwd.h" +@@ -183,6 +184,9 @@ ARROW_ENGINE_EXPORT Result DeserializeExpressions( + const ConversionOptions& conversion_options = {}, + ExtensionSet* ext_set_out = NULLPTR); + ++ARROW_ENGINE_EXPORT Status ++DeserializeMap(const Buffer& buf, std::unordered_map& out); ++ + /// \brief Deserializes a Substrait Type message to the corresponding Arrow type + /// + /// \param[in] buf a buffer containing the protobuf serialization of a Substrait Type +diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml +index d4d3e2c0f..ce72eaa1f 100644 +--- a/java/dataset/pom.xml ++++ b/java/dataset/pom.xml +@@ -25,9 +25,10 @@ + jar + + ../../../cpp/release-build/ +- 2.5.0 + 1.11.0 + 1.11.3 ++ 0.31.0 ++ 3.25.3 + + + +@@ -47,6 +48,18 @@ + arrow-c-data + compile + ++ ++ io.substrait ++ core ++ ${substrait.version} ++ provided ++ ++ ++ com.google.protobuf ++ protobuf-java ++ ${protobuf.version} ++ provided ++ + + org.apache.arrow + arrow-memory-netty +diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc +index 8d7dafd84..89cdc39fe 100644 +--- a/java/dataset/src/main/cpp/jni_wrapper.cc ++++ b/java/dataset/src/main/cpp/jni_wrapper.cc +@@ -25,6 +25,7 @@ + #include "arrow/c/helpers.h" + #include "arrow/dataset/api.h" + #include "arrow/dataset/file_base.h" ++#include "arrow/dataset/file_csv.h" + #include "arrow/filesystem/localfs.h" + 
#include "arrow/filesystem/path_util.h" + #ifdef ARROW_S3 +@@ -122,6 +123,19 @@ arrow::Result> GetFileFormat( + } + } + ++arrow::Result> ++GetFragmentScanOptions(jint file_format_id, ++ const std::unordered_map& configs) { ++ switch (file_format_id) { ++#ifdef ARROW_CSV ++ case 3: ++ return arrow::dataset::CsvFragmentScanOptions::from(configs); ++#endif ++ default: ++ return arrow::Status::Invalid("Illegal file format id: " ,file_format_id); ++ } ++} ++ + class ReserveFromJava : public arrow::dataset::jni::ReservationListener { + public: + ReserveFromJava(JavaVM* vm, jobject java_reservation_listener) +@@ -460,12 +474,13 @@ JNIEXPORT void JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_closeDataset + /* + * Class: org_apache_arrow_dataset_jni_JniWrapper + * Method: createScanner +- * Signature: (J[Ljava/lang/String;Ljava/nio/ByteBuffer;Ljava/nio/ByteBuffer;JJ)J ++ * Signature: ++ * (J[Ljava/lang/String;Ljava/nio/ByteBuffer;Ljava/nio/ByteBuffer;JJ;Ljava/nio/ByteBuffer;J)J + */ + JNIEXPORT jlong JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_createScanner( + JNIEnv* env, jobject, jlong dataset_id, jobjectArray columns, +- jobject substrait_projection, jobject substrait_filter, +- jlong batch_size, jlong memory_pool_id) { ++ jobject substrait_projection, jobject substrait_filter, jlong batch_size, ++ jlong file_format_id, jobject options, jlong memory_pool_id) { + JNI_METHOD_START + arrow::MemoryPool* pool = reinterpret_cast(memory_pool_id); + if (pool == nullptr) { +@@ -514,6 +529,14 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_createScann + } + JniAssertOkOrThrow(scanner_builder->Filter(*filter_expr)); + } ++ if (file_format_id != -1 && options != nullptr) { ++ std::unordered_map option_map; ++ std::shared_ptr buffer = LoadArrowBufferFromByteBuffer(env, options); ++ JniAssertOkOrThrow(arrow::engine::DeserializeMap(*buffer, option_map)); ++ std::shared_ptr scan_options = ++ JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map)); ++ JniAssertOkOrThrow(scanner_builder->FragmentScanOptions(scan_options)); ++ } + JniAssertOkOrThrow(scanner_builder->BatchSize(batch_size)); + + auto scanner = JniGetOrThrow(scanner_builder->Finish()); +@@ -627,14 +650,31 @@ JNIEXPORT void JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_ensureS3Fina + /* + * Class: org_apache_arrow_dataset_file_JniWrapper + * Method: makeFileSystemDatasetFactory +- * Signature: (Ljava/lang/String;II)J ++ * Signature: (Ljava/lang/String;IILjava/lang/String;Ljava/nio/ByteBuffer)J + */ + JNIEXPORT jlong JNICALL +-Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory__Ljava_lang_String_2I( +- JNIEnv* env, jobject, jstring uri, jint file_format_id) { ++Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory( ++ JNIEnv* env, jobject, jstring uri, jint file_format_id, jobject options) { + JNI_METHOD_START + std::shared_ptr file_format = + JniGetOrThrow(GetFileFormat(file_format_id)); ++ if (options != nullptr) { ++ std::unordered_map option_map; ++ std::shared_ptr buffer = LoadArrowBufferFromByteBuffer(env, options); ++ JniAssertOkOrThrow(arrow::engine::DeserializeMap(*buffer, option_map)); ++ std::shared_ptr scan_options = ++ JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map)); ++ file_format->default_fragment_scan_options = scan_options; ++#ifdef ARROW_CSV ++ if (file_format_id == 3) { ++ std::shared_ptr csv_file_format = ++ std::dynamic_pointer_cast(file_format); ++ csv_file_format->parse_options = ++ 
std::dynamic_pointer_cast(scan_options) ++ ->parse_options; ++ } ++#endif ++ } + arrow::dataset::FileSystemFactoryOptions options; + std::shared_ptr d = + JniGetOrThrow(arrow::dataset::FileSystemDatasetFactory::Make( +@@ -645,16 +685,33 @@ Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory__Ljav + + /* + * Class: org_apache_arrow_dataset_file_JniWrapper +- * Method: makeFileSystemDatasetFactory +- * Signature: ([Ljava/lang/String;II)J ++ * Method: makeFileSystemDatasetFactoryWithFiles ++ * Signature: ([Ljava/lang/String;IIJ;Ljava/nio/ByteBuffer)J + */ + JNIEXPORT jlong JNICALL +-Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory___3Ljava_lang_String_2I( +- JNIEnv* env, jobject, jobjectArray uris, jint file_format_id) { ++Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactoryWithFiles( ++ JNIEnv* env, jobject, jobjectArray uris, jint file_format_id, jobject options) { + JNI_METHOD_START + + std::shared_ptr file_format = + JniGetOrThrow(GetFileFormat(file_format_id)); ++ if (options != nullptr) { ++ std::unordered_map option_map; ++ std::shared_ptr buffer = LoadArrowBufferFromByteBuffer(env, options); ++ JniAssertOkOrThrow(arrow::engine::DeserializeMap(*buffer, option_map)); ++ std::shared_ptr scan_options = ++ JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map)); ++ file_format->default_fragment_scan_options = scan_options; ++#ifdef ARROW_CSV ++ if (file_format_id == 3) { ++ std::shared_ptr csv_file_format = ++ std::dynamic_pointer_cast(file_format); ++ csv_file_format->parse_options = ++ std::dynamic_pointer_cast(scan_options) ++ ->parse_options; ++ } ++#endif ++ } + arrow::dataset::FileSystemFactoryOptions options; + + std::vector uri_vec = ToStringVector(env, uris); +diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileSystemDatasetFactory.java b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileSystemDatasetFactory.java +index aa3156905..a0b6fb168 100644 +--- a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileSystemDatasetFactory.java ++++ b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileSystemDatasetFactory.java +@@ -17,8 +17,11 @@ + + package org.apache.arrow.dataset.file; + ++import java.util.Optional; ++ + import org.apache.arrow.dataset.jni.NativeDatasetFactory; + import org.apache.arrow.dataset.jni.NativeMemoryPool; ++import org.apache.arrow.dataset.scanner.FragmentScanOptions; + import org.apache.arrow.memory.BufferAllocator; + + /** +@@ -27,21 +30,34 @@ import org.apache.arrow.memory.BufferAllocator; + public class FileSystemDatasetFactory extends NativeDatasetFactory { + + public FileSystemDatasetFactory(BufferAllocator allocator, NativeMemoryPool memoryPool, FileFormat format, +- String uri) { +- super(allocator, memoryPool, createNative(format, uri)); ++ String uri, Optional fragmentScanOptions) { ++ super(allocator, memoryPool, createNative(format, uri, fragmentScanOptions)); ++ } ++ ++ public FileSystemDatasetFactory(BufferAllocator allocator, NativeMemoryPool memoryPool, FileFormat format, ++ String uri) { ++ super(allocator, memoryPool, createNative(format, uri, Optional.empty())); ++ } ++ ++ public FileSystemDatasetFactory(BufferAllocator allocator, NativeMemoryPool memoryPool, FileFormat format, ++ String[] uris, Optional fragmentScanOptions) { ++ super(allocator, memoryPool, createNative(format, uris, fragmentScanOptions)); + } + + public FileSystemDatasetFactory(BufferAllocator allocator, NativeMemoryPool memoryPool, FileFormat 
format, + String[] uris) { +- super(allocator, memoryPool, createNative(format, uris)); ++ super(allocator, memoryPool, createNative(format, uris, Optional.empty())); + } + +- private static long createNative(FileFormat format, String uri) { +- return JniWrapper.get().makeFileSystemDatasetFactory(uri, format.id()); ++ private static long createNative(FileFormat format, String uri, Optional fragmentScanOptions) { ++ return JniWrapper.get().makeFileSystemDatasetFactory(uri, format.id(), ++ fragmentScanOptions.map(FragmentScanOptions::serialize).orElse(null)); + } + +- private static long createNative(FileFormat format, String[] uris) { +- return JniWrapper.get().makeFileSystemDatasetFactory(uris, format.id()); ++ private static long createNative(FileFormat format, String[] uris, ++ Optional fragmentScanOptions) { ++ return JniWrapper.get().makeFileSystemDatasetFactoryWithFiles(uris, format.id(), ++ fragmentScanOptions.map(FragmentScanOptions::serialize).orElse(null)); + } + + } +diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java b/java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java +index c3a1a4e58..c3f8e12b3 100644 +--- a/java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java ++++ b/java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java +@@ -17,6 +17,8 @@ + + package org.apache.arrow.dataset.file; + ++import java.nio.ByteBuffer; ++ + import org.apache.arrow.dataset.jni.JniLoader; + + /** +@@ -43,7 +45,8 @@ public class JniWrapper { + * @return the native pointer of the arrow::dataset::FileSystemDatasetFactory instance. + * @see FileFormat + */ +- public native long makeFileSystemDatasetFactory(String uri, int fileFormat); ++ public native long makeFileSystemDatasetFactory(String uri, int fileFormat, ++ ByteBuffer serializedFragmentScanOptions); + + /** + * Create FileSystemDatasetFactory and return its native pointer. The pointer is pointing to a +@@ -54,7 +57,8 @@ public class JniWrapper { + * @return the native pointer of the arrow::dataset::FileSystemDatasetFactory instance. + * @see FileFormat + */ +- public native long makeFileSystemDatasetFactory(String[] uris, int fileFormat); ++ public native long makeFileSystemDatasetFactoryWithFiles(String[] uris, int fileFormat, ++ ByteBuffer serializedFragmentScanOptions); + + /** + * Write the content in a {@link org.apache.arrow.c.ArrowArrayStream} into files. This internally +diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java +index 637a3e8f2..6d6309140 100644 +--- a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java ++++ b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java +@@ -80,7 +80,8 @@ public class JniWrapper { + * @return the native pointer of the arrow::dataset::Scanner instance. + */ + public native long createScanner(long datasetId, String[] columns, ByteBuffer substraitProjection, +- ByteBuffer substraitFilter, long batchSize, long memoryPool); ++ ByteBuffer substraitFilter, long batchSize, long fileFormat, ++ ByteBuffer serializedFragmentScanOptions, long memoryPool); + + /** + * Get a serialized schema from native instance of a Scanner. 
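Editor's note (usage sketch, not part of the patch): fragment scan options can be supplied when the factory is created, in which case they become the format's default options natively. The CSV option classes used here are introduced further down in this patch; the file URI is a placeholder.

import java.util.Optional;

import org.apache.arrow.dataset.file.FileFormat;
import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
import org.apache.arrow.dataset.jni.NativeMemoryPool;
import org.apache.arrow.dataset.scanner.csv.CsvConvertOptions;
import org.apache.arrow.dataset.scanner.csv.CsvFragmentScanOptions;
import org.apache.arrow.memory.BufferAllocator;

import com.google.common.collect.ImmutableMap;

final class CsvFactoryExample {
  // Sketch: the serialized options become the CSV format's default_fragment_scan_options
  // on the native side (see the jni_wrapper.cc change above).
  static FileSystemDatasetFactory newCsvFactory(BufferAllocator allocator) {
    CsvFragmentScanOptions csvOptions = new CsvFragmentScanOptions(
        new CsvConvertOptions(ImmutableMap.of("delimiter", ";")),
        ImmutableMap.of(),   // read options
        ImmutableMap.of());  // parse options
    return new FileSystemDatasetFactory(
        allocator, NativeMemoryPool.getDefault(), FileFormat.CSV,
        "file:///tmp/student.csv", Optional.of(csvOptions));  // placeholder URI
  }
}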
+diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/NativeDataset.java b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/NativeDataset.java +index d9abad997..3a96fe768 100644 +--- a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/NativeDataset.java ++++ b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/NativeDataset.java +@@ -17,6 +17,9 @@ + + package org.apache.arrow.dataset.jni; + ++import java.nio.ByteBuffer; ++ ++import org.apache.arrow.dataset.scanner.FragmentScanOptions; + import org.apache.arrow.dataset.scanner.ScanOptions; + import org.apache.arrow.dataset.source.Dataset; + +@@ -40,11 +43,18 @@ public class NativeDataset implements Dataset { + if (closed) { + throw new NativeInstanceReleasedException(); + } +- ++ int fileFormat = -1; ++ ByteBuffer serialized = null; ++ if (options.getFragmentScanOptions().isPresent()) { ++ FragmentScanOptions fragmentScanOptions = options.getFragmentScanOptions().get(); ++ fileFormat = fragmentScanOptions.fileFormatId(); ++ serialized = fragmentScanOptions.serialize(); ++ } + long scannerId = JniWrapper.get().createScanner(datasetId, options.getColumns().orElse(null), + options.getSubstraitProjection().orElse(null), + options.getSubstraitFilter().orElse(null), +- options.getBatchSize(), context.getMemoryPool().getNativeInstanceId()); ++ options.getBatchSize(), fileFormat, serialized, ++ context.getMemoryPool().getNativeInstanceId()); + + return new NativeScanner(context, scannerId); + } +diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java +new file mode 100644 +index 000000000..8acb2b2d4 +--- /dev/null ++++ b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java +@@ -0,0 +1,50 @@ ++/* ++ * Licensed to the Apache Software Foundation (ASF) under one or more ++ * contributor license agreements. See the NOTICE file distributed with ++ * this work for additional information regarding copyright ownership. ++ * The ASF licenses this file to You under the Apache License, Version 2.0 ++ * (the "License"); you may not use this file except in compliance with ++ * the License. You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++package org.apache.arrow.dataset.scanner; ++ ++import java.nio.ByteBuffer; ++import java.util.Map; ++ ++import org.apache.arrow.dataset.substrait.util.ConvertUtil; ++ ++import io.substrait.proto.Expression; ++ ++public interface FragmentScanOptions { ++ String typeName(); ++ ++ int fileFormatId(); ++ ++ ByteBuffer serialize(); ++ ++ /** ++ * serialize the map. 
++ * ++ * @param config config map ++ * @return buffer for the JNI call argument, should be a DirectByteBuffer ++ */ ++ default ByteBuffer serializeMap(Map config) { ++ if (config.isEmpty()) { ++ return null; ++ } ++ ++ Expression.Literal literal = ConvertUtil.mapToExpressionLiteral(config); ++ ByteBuffer buf = ByteBuffer.allocateDirect(literal.getSerializedSize()); ++ buf.put(literal.toByteArray()); ++ return buf; ++ } ++} +diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java +index 995d05ac3..aad71930c 100644 +--- a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java ++++ b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java +@@ -31,6 +31,8 @@ public class ScanOptions { + private final Optional substraitProjection; + private final Optional substraitFilter; + ++ private final Optional fragmentScanOptions; ++ + /** + * Constructor. + * @param columns Projected columns. Empty for scanning all columns. +@@ -61,6 +63,7 @@ public class ScanOptions { + this.columns = columns; + this.substraitProjection = Optional.empty(); + this.substraitFilter = Optional.empty(); ++ this.fragmentScanOptions = Optional.empty(); + } + + public ScanOptions(long batchSize) { +@@ -83,6 +86,10 @@ public class ScanOptions { + return substraitFilter; + } + ++ public Optional getFragmentScanOptions() { ++ return fragmentScanOptions; ++ } ++ + /** + * Builder for Options used during scanning. + */ +@@ -91,6 +98,7 @@ public class ScanOptions { + private Optional columns; + private ByteBuffer substraitProjection; + private ByteBuffer substraitFilter; ++ private FragmentScanOptions fragmentScanOptions; + + /** + * Constructor. +@@ -136,6 +144,18 @@ public class ScanOptions { + return this; + } + ++ /** ++ * Set the FragmentScanOptions. ++ * ++ * @param fragmentScanOptions scan options ++ * @return the ScanOptions configured. ++ */ ++ public Builder fragmentScanOptions(FragmentScanOptions fragmentScanOptions) { ++ Preconditions.checkNotNull(fragmentScanOptions); ++ this.fragmentScanOptions = fragmentScanOptions; ++ return this; ++ } ++ + public ScanOptions build() { + return new ScanOptions(this); + } +@@ -146,5 +166,6 @@ public class ScanOptions { + columns = builder.columns; + substraitProjection = Optional.ofNullable(builder.substraitProjection); + substraitFilter = Optional.ofNullable(builder.substraitFilter); ++ fragmentScanOptions = Optional.ofNullable(builder.fragmentScanOptions); + } + } +diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvConvertOptions.java b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvConvertOptions.java +new file mode 100644 +index 000000000..08e35ede2 +--- /dev/null ++++ b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvConvertOptions.java +@@ -0,0 +1,51 @@ ++/* ++ * Licensed to the Apache Software Foundation (ASF) under one or more ++ * contributor license agreements. See the NOTICE file distributed with ++ * this work for additional information regarding copyright ownership. ++ * The ASF licenses this file to You under the Apache License, Version 2.0 ++ * (the "License"); you may not use this file except in compliance with ++ * the License.
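Editor's note (usage sketch, not part of the patch): options can also be attached per scan through the new ScanOptions builder hook; the test added at the end of the Java changes runs this end to end, so the short sketch below only shows the call pattern.

import java.util.Optional;

import org.apache.arrow.dataset.scanner.ScanOptions;
import org.apache.arrow.dataset.scanner.csv.CsvFragmentScanOptions;

final class CsvScanOptionsExample {
  // Sketch: NativeDataset forwards fileFormatId() and serialize() from the attached
  // FragmentScanOptions down to JniWrapper.createScanner.
  static ScanOptions csvScanOptions(CsvFragmentScanOptions csvOptions) {
    return new ScanOptions.Builder(/*batchSize*/ 32768)
        .columns(Optional.empty())
        .fragmentScanOptions(csvOptions)
        .build();
  }
}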
You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++package org.apache.arrow.dataset.scanner.csv; ++ ++import java.util.Map; ++import java.util.Optional; ++ ++import org.apache.arrow.c.ArrowSchema; ++ ++public class CsvConvertOptions { ++ ++ private final Map configs; ++ ++ private Optional cSchema = Optional.empty(); ++ ++ public CsvConvertOptions(Map configs) { ++ this.configs = configs; ++ } ++ ++ public Optional getArrowSchema() { ++ return cSchema; ++ } ++ ++ public Map getConfigs() { ++ return configs; ++ } ++ ++ public void set(String key, String value) { ++ configs.put(key, value); ++ } ++ ++ public void setArrowSchema(ArrowSchema cSchema) { ++ this.cSchema = Optional.of(cSchema); ++ } ++ ++} +diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java +new file mode 100644 +index 000000000..88973f0a0 +--- /dev/null ++++ b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java +@@ -0,0 +1,97 @@ ++/* ++ * Licensed to the Apache Software Foundation (ASF) under one or more ++ * contributor license agreements. See the NOTICE file distributed with ++ * this work for additional information regarding copyright ownership. ++ * The ASF licenses this file to You under the Apache License, Version 2.0 ++ * (the "License"); you may not use this file except in compliance with ++ * the License. You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++package org.apache.arrow.dataset.scanner.csv; ++ ++import java.io.Serializable; ++import java.nio.ByteBuffer; ++import java.util.Locale; ++import java.util.Map; ++import java.util.stream.Collectors; ++import java.util.stream.Stream; ++ ++import org.apache.arrow.dataset.file.FileFormat; ++import org.apache.arrow.dataset.scanner.FragmentScanOptions; ++ ++public class CsvFragmentScanOptions implements Serializable, FragmentScanOptions { ++ private final CsvConvertOptions convertOptions; ++ private final Map readOptions; ++ private final Map parseOptions; ++ ++ ++ /** ++ * csv scan options, map to CPP struct CsvFragmentScanOptions. ++ * ++ * @param convertOptions same struct in CPP ++ * @param readOptions same struct in CPP ++ * @param parseOptions same struct in CPP ++ */ ++ public CsvFragmentScanOptions(CsvConvertOptions convertOptions, ++ Map readOptions, ++ Map parseOptions) { ++ this.convertOptions = convertOptions; ++ this.readOptions = readOptions; ++ this.parseOptions = parseOptions; ++ } ++ ++ public String typeName() { ++ return FileFormat.CSV.name().toLowerCase(Locale.ROOT); ++ } ++ ++ /** ++ * File format id. 
++ * ++ * @return id ++ */ ++ public int fileFormatId() { ++ return FileFormat.CSV.id(); ++ } ++ ++ /** ++ * Serialize this class to a ByteBuffer that is then passed to the JNI call. ++ * ++ * @return DirectByteBuffer ++ */ ++ public ByteBuffer serialize() { ++ Map options = Stream.concat(Stream.concat(readOptions.entrySet().stream(), ++ parseOptions.entrySet().stream()), ++ convertOptions.getConfigs().entrySet().stream()).collect( ++ Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); ++ ++ if (convertOptions.getArrowSchema().isPresent()) { ++ options.put("column_types", Long.toString(convertOptions.getArrowSchema().get().memoryAddress())); ++ } ++ return serializeMap(options); ++ } ++ ++ public static CsvFragmentScanOptions deserialize(String serialized) { ++ throw new UnsupportedOperationException("Not implemented now"); ++ } ++ ++ public CsvConvertOptions getConvertOptions() { ++ return convertOptions; ++ } ++ ++ public Map getReadOptions() { ++ return readOptions; ++ } ++ ++ public Map getParseOptions() { ++ return parseOptions; ++ } ++ ++} +diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/util/ConvertUtil.java b/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/util/ConvertUtil.java +new file mode 100644 +index 000000000..31a4023af +--- /dev/null ++++ b/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/util/ConvertUtil.java +@@ -0,0 +1,46 @@ ++/* ++ * Licensed to the Apache Software Foundation (ASF) under one or more ++ * contributor license agreements. See the NOTICE file distributed with ++ * this work for additional information regarding copyright ownership. ++ * The ASF licenses this file to You under the Apache License, Version 2.0 ++ * (the "License"); you may not use this file except in compliance with ++ * the License. You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++package org.apache.arrow.dataset.substrait.util; ++ ++import java.util.Map; ++ ++import io.substrait.proto.Expression; ++ ++public class ConvertUtil { ++ ++ /** ++ * Convert map to substrait Expression.
++ * ++ * @return Substrait Expression ++ */ ++ public static Expression.Literal mapToExpressionLiteral(Map values) { ++ Expression.Literal.Builder literalBuilder = Expression.Literal.newBuilder(); ++ Expression.Literal.Map.KeyValue.Builder keyValueBuilder = ++ Expression.Literal.Map.KeyValue.newBuilder(); ++ Expression.Literal.Map.Builder mapBuilder = Expression.Literal.Map.newBuilder(); ++ for (Map.Entry entry : values.entrySet()) { ++ literalBuilder.setString(entry.getKey()); ++ keyValueBuilder.setKey(literalBuilder.build()); ++ literalBuilder.setString(entry.getValue()); ++ keyValueBuilder.setValue(literalBuilder.build()); ++ mapBuilder.addKeyValues(keyValueBuilder.build()); ++ } ++ literalBuilder.setMap(mapBuilder.build()); ++ return literalBuilder.build(); ++ } ++} +diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/substrait/TestAceroSubstraitConsumer.java b/java/dataset/src/test/java/org/apache/arrow/dataset/substrait/TestAceroSubstraitConsumer.java +index 0fba72892..e7903b7a4 100644 +--- a/java/dataset/src/test/java/org/apache/arrow/dataset/substrait/TestAceroSubstraitConsumer.java ++++ b/java/dataset/src/test/java/org/apache/arrow/dataset/substrait/TestAceroSubstraitConsumer.java +@@ -31,6 +31,9 @@ import java.util.HashMap; + import java.util.Map; + import java.util.Optional; + ++import org.apache.arrow.c.ArrowSchema; ++import org.apache.arrow.c.CDataDictionaryProvider; ++import org.apache.arrow.c.Data; + import org.apache.arrow.dataset.ParquetWriteSupport; + import org.apache.arrow.dataset.TestDataset; + import org.apache.arrow.dataset.file.FileFormat; +@@ -38,8 +41,11 @@ import org.apache.arrow.dataset.file.FileSystemDatasetFactory; + import org.apache.arrow.dataset.jni.NativeMemoryPool; + import org.apache.arrow.dataset.scanner.ScanOptions; + import org.apache.arrow.dataset.scanner.Scanner; ++import org.apache.arrow.dataset.scanner.csv.CsvConvertOptions; ++import org.apache.arrow.dataset.scanner.csv.CsvFragmentScanOptions; + import org.apache.arrow.dataset.source.Dataset; + import org.apache.arrow.dataset.source.DatasetFactory; ++import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.vector.ipc.ArrowReader; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Field; +@@ -49,6 +55,8 @@ import org.junit.ClassRule; + import org.junit.Test; + import org.junit.rules.TemporaryFolder; + ++import com.google.common.collect.ImmutableMap; ++ + public class TestAceroSubstraitConsumer extends TestDataset { + + @ClassRule +@@ -457,4 +465,42 @@ public class TestAceroSubstraitConsumer extends TestDataset { + substraitExpression.put(decodedSubstrait); + return substraitExpression; + } ++ ++ @Test ++ public void testCsvConvertOptions() throws Exception { ++ final Schema schema = new Schema(Arrays.asList( ++ Field.nullable("Id", new ArrowType.Int(32, true)), ++ Field.nullable("Name", new ArrowType.Utf8()), ++ Field.nullable("Language", new ArrowType.Utf8()) ++ ), null); ++ String path = "file://" + getClass().getResource("/").getPath() + "/data/student.csv"; ++ BufferAllocator allocator = rootAllocator(); ++ try (ArrowSchema cSchema = ArrowSchema.allocateNew(allocator); ++ CDataDictionaryProvider provider = new CDataDictionaryProvider()) { ++ Data.exportSchema(allocator, schema, provider, cSchema); ++ CsvConvertOptions convertOptions = new CsvConvertOptions(ImmutableMap.of("delimiter", ";")); ++ convertOptions.setArrowSchema(cSchema); ++ CsvFragmentScanOptions fragmentScanOptions = new CsvFragmentScanOptions( ++ 
convertOptions, ImmutableMap.of(), ImmutableMap.of()); ++ ScanOptions options = new ScanOptions.Builder(/*batchSize*/ 32768) ++ .columns(Optional.empty()) ++ .fragmentScanOptions(fragmentScanOptions) ++ .build(); ++ try ( ++ DatasetFactory datasetFactory = new FileSystemDatasetFactory(allocator, NativeMemoryPool.getDefault(), ++ FileFormat.CSV, path); ++ Dataset dataset = datasetFactory.finish(); ++ Scanner scanner = dataset.newScan(options); ++ ArrowReader reader = scanner.scanBatches() ++ ) { ++ assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); ++ int rowCount = 0; ++ while (reader.loadNextBatch()) { ++ assertEquals("[1, 2, 3]", reader.getVectorSchemaRoot().getVector("Id").toString()); ++ rowCount += reader.getVectorSchemaRoot().getRowCount(); ++ } ++ assertEquals(3, rowCount); ++ } ++ } ++ } + } +diff --git a/java/dataset/src/test/resources/data/student.csv b/java/dataset/src/test/resources/data/student.csv +new file mode 100644 +index 000000000..329194609 +--- /dev/null ++++ b/java/dataset/src/test/resources/data/student.csv +@@ -0,0 +1,4 @@ ++Id;Name;Language ++1;Juno;Java ++2;Peter;Python ++3;Celin;C++ diff --git a/ep/build-velox/src/modify_velox.patch b/ep/build-velox/src/modify_velox.patch index 4bcb228455b1f..7e1f19b3c18d2 100644 --- a/ep/build-velox/src/modify_velox.patch +++ b/ep/build-velox/src/modify_velox.patch @@ -35,11 +35,47 @@ index d49115f12..1aaa8e532 100644 + IMPORTED_LOCATION_DEBUG "${LZ4_LIBRARY_DEBUG}") + endif() endif() +diff --git a/CMake/resolve_dependency_modules/arrow/CMakeLists.txt b/CMake/resolve_dependency_modules/arrow/CMakeLists.txt +index 3f01df2fd..a8da374a2 100644 +--- a/CMake/resolve_dependency_modules/arrow/CMakeLists.txt ++++ b/CMake/resolve_dependency_modules/arrow/CMakeLists.txt +@@ -23,7 +23,11 @@ if(VELOX_ENABLE_ARROW) + + set(ARROW_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep") + set(ARROW_CMAKE_ARGS +- -DARROW_PARQUET=OFF ++ -DARROW_PARQUET=ON ++ -DARROW_FILESYSTEM=ON ++ -DARROW_PROTOBUF_USE_SHARED=OFF ++ -DARROW_DEPENDENCY_USE_SHARED=OFF ++ -DARROW_DEPENDENCY_SOURCE=BUNDLED + -DARROW_WITH_THRIFT=ON + -DARROW_WITH_LZ4=ON + -DARROW_WITH_SNAPPY=ON +@@ -37,7 +41,7 @@ if(VELOX_ENABLE_ARROW) + -DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}/install + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DARROW_BUILD_STATIC=ON +- -DThrift_SOURCE=${THRIFT_SOURCE}) ++ -DARROW_BUILD_SHARED=OFF) + set(ARROW_LIBDIR ${ARROW_PREFIX}/install/${CMAKE_INSTALL_LIBDIR}) + + add_library(thrift STATIC IMPORTED GLOBAL) +@@ -66,6 +70,9 @@ if(VELOX_ENABLE_ARROW) + arrow_ep + PREFIX ${ARROW_PREFIX} + URL ${VELOX_ARROW_SOURCE_URL} ++ PATCH_COMMAND patch -p1 < ${CMAKE_CURRENT_SOURCE_DIR}/modify_arrow.patch ++ COMMAND patch -p1 < ++ ${CMAKE_CURRENT_SOURCE_DIR}/modify_arrow_dataset_scan_option.patch + URL_HASH ${VELOX_ARROW_BUILD_SHA256_CHECKSUM} + SOURCE_SUBDIR cpp + CMAKE_ARGS ${ARROW_CMAKE_ARGS} diff --git a/CMakeLists.txt b/CMakeLists.txt -index 53aaf4391..90aba6916 100644 +index bb7c49907..3372d48b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -243,10 +243,15 @@ if(VELOX_ENABLE_ABFS) +@@ -234,10 +234,15 @@ if(VELOX_ENABLE_ABFS) endif() if(VELOX_ENABLE_HDFS) @@ -59,7 +95,7 @@ index 53aaf4391..90aba6916 100644 add_definitions(-DVELOX_ENABLE_HDFS3) endif() -@@ -386,7 +391,7 @@ resolve_dependency(Boost 1.77.0 COMPONENTS ${BOOST_INCLUDE_LIBRARIES}) +@@ -378,7 +383,7 @@ resolve_dependency(Boost 1.77.0 COMPONENTS ${BOOST_INCLUDE_LIBRARIES}) # for reference. 
find_package(range-v3) set_source(gflags) @@ -68,29 +104,6 @@ index 53aaf4391..90aba6916 100644 if(NOT TARGET gflags::gflags) # This is a bit convoluted, but we want to be able to use gflags::gflags as a # target even when velox is built as a subproject which uses -diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt -index ce4c24dbe..785a2acc6 100644 ---- a/third_party/CMakeLists.txt -+++ b/third_party/CMakeLists.txt -@@ -26,7 +26,9 @@ if(VELOX_ENABLE_ARROW) - endif() - set(ARROW_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep") - set(ARROW_CMAKE_ARGS -- -DARROW_PARQUET=OFF -+ -DARROW_PARQUET=ON -+ -DARROW_FILESYSTEM=ON -+ -DARROW_PROTOBUF_USE_SHARED=OFF - -DARROW_WITH_THRIFT=ON - -DARROW_WITH_LZ4=ON - -DARROW_WITH_SNAPPY=ON -@@ -69,6 +71,7 @@ if(VELOX_ENABLE_ARROW) - arrow_ep - PREFIX ${ARROW_PREFIX} - URL ${VELOX_ARROW_SOURCE_URL} -+ PATCH_COMMAND patch -p1 < ${CMAKE_CURRENT_SOURCE_DIR}/modify_arrow.patch - URL_HASH ${VELOX_ARROW_BUILD_SHA256_CHECKSUM} - SOURCE_SUBDIR cpp - CMAKE_ARGS ${ARROW_CMAKE_ARGS} diff --git a/velox/common/process/tests/CMakeLists.txt b/velox/common/process/tests/CMakeLists.txt index 6797697a1..3e241f8f7 100644 --- a/velox/common/process/tests/CMakeLists.txt @@ -126,17 +139,6 @@ index 10ee508ba..027a58ecc 100644 setupEnvironment(hadoopHomeDirectory.string()); } -diff --git a/velox/dwio/common/CMakeLists.txt b/velox/dwio/common/CMakeLists.txt -index 9b6574d6e..61abddb59 100644 ---- a/velox/dwio/common/CMakeLists.txt -+++ b/velox/dwio/common/CMakeLists.txt -@@ -77,4 +77,5 @@ target_link_libraries( - velox_memory - Boost::regex - Folly::folly -- glog::glog) -+ glog::glog -+ protobuf::libprotobuf) diff --git a/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt b/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt index 2cabfc29a..54329ce23 100644 --- a/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt diff --git a/gluten-celeborn/clickhouse/pom.xml b/gluten-celeborn/clickhouse/pom.xml index 74b81031fb38a..f17f5968d3511 100755 --- a/gluten-celeborn/clickhouse/pom.xml +++ b/gluten-celeborn/clickhouse/pom.xml @@ -127,7 +127,7 @@ org.scalatestplus - scalatestplus-mockito_2.12 + scalatestplus-mockito_${scala.binary.version} 1.0.0-M2 test @@ -138,7 +138,7 @@ org.scalatestplus - scalatestplus-scalacheck_2.12 + scalatestplus-scalacheck_${scala.binary.version} 3.1.0.0-RC2 test diff --git a/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornHashBasedColumnarShuffleWriter.scala b/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornHashBasedColumnarShuffleWriter.scala index 75efa355302de..a7836e4a13d1e 100644 --- a/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornHashBasedColumnarShuffleWriter.scala +++ b/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornHashBasedColumnarShuffleWriter.scala @@ -19,9 +19,7 @@ package org.apache.spark.shuffle import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.clickhouse.CHBackendSettings import org.apache.gluten.memory.alloc.CHNativeMemoryAllocators -import org.apache.gluten.memory.memtarget.MemoryTarget -import org.apache.gluten.memory.memtarget.Spiller -import org.apache.gluten.memory.memtarget.Spillers +import org.apache.gluten.memory.memtarget.{MemoryTarget, Spiller, Spillers} import org.apache.gluten.vectorized._ import org.apache.spark._ @@ -33,7 +31,6 @@ import org.apache.celeborn.client.ShuffleClient import org.apache.celeborn.common.CelebornConf import java.io.IOException -import java.util 
import java.util.Locale class CHCelebornHashBasedColumnarShuffleWriter[K, V]( @@ -78,12 +75,17 @@ class CHCelebornHashBasedColumnarShuffleWriter[K, V]( CHBackendSettings.shuffleHashAlgorithm, celebornPartitionPusher, GlutenConfig.getConf.chColumnarThrowIfMemoryExceed, - GlutenConfig.getConf.chColumnarFlushBlockBufferBeforeEvict + GlutenConfig.getConf.chColumnarFlushBlockBufferBeforeEvict, + GlutenConfig.getConf.chColumnarForceExternalSortShuffle, + GlutenConfig.getConf.chColumnarForceMemorySortShuffle ) CHNativeMemoryAllocators.createSpillable( "CelebornShuffleWriter", new Spiller() { - override def spill(self: MemoryTarget, size: Long): Long = { + override def spill(self: MemoryTarget, phase: Spiller.Phase, size: Long): Long = { + if (!Spillers.PHASE_SET_SPILL_ONLY.contains(phase)) { + return 0L + } if (nativeShuffleWriter == -1L) { throw new IllegalStateException( "Fatal: spill() called before a celeborn shuffle writer " + @@ -96,8 +98,6 @@ class CHCelebornHashBasedColumnarShuffleWriter[K, V]( logInfo(s"Gluten shuffle writer: Spilled $spilled / $size bytes of data") spilled } - - override def applicablePhases(): util.Set[Spiller.Phase] = Spillers.PHASE_SET_SPILL_ONLY } ) } diff --git a/gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarSortShuffleAQESuite.scala b/gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarExternalSortShuffleSuite.scala similarity index 97% rename from gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarSortShuffleAQESuite.scala rename to gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarExternalSortShuffleSuite.scala index 0072fe8c95d64..3ecf1fc1ac7b3 100644 --- a/gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarSortShuffleAQESuite.scala +++ b/gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarExternalSortShuffleSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.execution.CoalescedPartitionSpec import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveSparkPlanHelper, AQEShuffleReadExec} import org.apache.spark.sql.internal.SQLConf -class GlutenClickHouseRSSColumnarSortShuffleAQESuite +class GlutenClickHouseRSSColumnarExternalSortShuffleSuite extends GlutenClickHouseTPCHAbstractSuite with AdaptiveSparkPlanHelper { @@ -45,7 +45,7 @@ class GlutenClickHouseRSSColumnarSortShuffleAQESuite .set("spark.sql.adaptive.enabled", "true") .set("spark.shuffle.service.enabled", "false") .set("spark.celeborn.client.spark.shuffle.writer", "hash") - .set("spark.gluten.sql.columnar.backend.ch.forceSortShuffle", "true") + .set("spark.gluten.sql.columnar.backend.ch.forceExternalSortShuffle", "true") } test("TPCH Q1") { diff --git a/gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarMemorySortShuffleSuite.scala b/gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarMemorySortShuffleSuite.scala new file mode 100644 index 0000000000000..ddef1d87cd088 --- /dev/null +++ b/gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarMemorySortShuffleSuite.scala @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.execution + +import org.apache.spark.SparkConf +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper + +class GlutenClickHouseRSSColumnarMemorySortShuffleSuite + extends GlutenClickHouseTPCHAbstractSuite + with AdaptiveSparkPlanHelper { + + override protected val tablesPath: String = basePath + "/tpch-data-ch" + override protected val tpchQueries: String = rootPath + "queries/tpch-queries-ch" + override protected val queriesResults: String = + rootPath + "../../../../../backends-clickhouse/src/test/resources/mergetree-queries-output" + + override protected val parquetTableDataPath: String = + "../../../../../gluten-core/src/test/resources/tpch-data" + + /** Run Gluten + ClickHouse Backend with ColumnarShuffleManager */ + override protected def sparkConf: SparkConf = { + super.sparkConf + .set( + "spark.shuffle.manager", + "org.apache.spark.shuffle.gluten.celeborn.CelebornShuffleManager") + .set("spark.io.compression.codec", "LZ4") + .set("spark.sql.shuffle.partitions", "5") + .set("spark.sql.autoBroadcastJoinThreshold", "10MB") + .set("spark.sql.adaptive.enabled", "true") + .set("spark.shuffle.service.enabled", "false") + .set("spark.celeborn.client.spark.shuffle.writer", "hash") + .set("spark.gluten.sql.columnar.backend.ch.forceMemorySortShuffle", "true") + } + + test("TPCH Q1") { + runTPCHQuery(1) { df => } + } + + test("TPCH Q2") { + runTPCHQuery(2) { df => } + } + + test("TPCH Q3") { + runTPCHQuery(3) { df => } + } + + test("TPCH Q4") { + runTPCHQuery(4) { df => } + } + + test("TPCH Q5") { + runTPCHQuery(5) { df => } + } + + test("TPCH Q6") { + runTPCHQuery(6) { df => } + } + + test("TPCH Q7") { + runTPCHQuery(7) { df => } + } + + test("TPCH Q8") { + runTPCHQuery(8) { df => } + } + + test("TPCH Q9") { + runTPCHQuery(9) { df => } + } + + test("TPCH Q10") { + runTPCHQuery(10) { df => } + } + + test("TPCH Q11") { + runTPCHQuery(11) { df => } + } + + test("TPCH Q12") { + runTPCHQuery(12) { df => } + } + + test("TPCH Q13") { + runTPCHQuery(13) { df => } + } + + test("TPCH Q14") { + runTPCHQuery(14) { df => } + } + + test("TPCH Q15") { + runTPCHQuery(15) { df => } + } + + test("TPCH Q16") { + runTPCHQuery(16, noFallBack = false) { df => } + } + + test("TPCH Q17") { + runTPCHQuery(17) { df => } + } + + test("TPCH Q18") { + runTPCHQuery(18) { df => } + } + + test("TPCH Q19") { + runTPCHQuery(19) { df => } + } + + test("TPCH Q20") { + runTPCHQuery(20) { df => } + } + + test("TPCH Q21") { + runTPCHQuery(21, noFallBack = false) { df => } + } + + test("TPCH Q22") { + runTPCHQuery(22) { df => } + } +} diff --git a/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java b/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java index f454cf00c656f..e61aeb3d70883 100644 
--- a/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java +++ b/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java @@ -16,6 +16,7 @@ */ package org.apache.spark.shuffle.gluten.celeborn; +import org.apache.gluten.GlutenConfig; import org.apache.gluten.backendsapi.BackendsApiManager; import org.apache.gluten.exception.GlutenException; @@ -194,9 +195,14 @@ public ShuffleHandle registerShuffle( if (dependency instanceof ColumnarShuffleDependency) { if (fallbackPolicyRunner.applyAllFallbackPolicy( lifecycleManager, dependency.partitioner().numPartitions())) { - logger.warn("Fallback to ColumnarShuffleManager!"); - columnarShuffleIds.add(shuffleId); - return columnarShuffleManager().registerShuffle(shuffleId, dependency); + if (GlutenConfig.getConf().enableCelebornFallback()) { + logger.warn("Fallback to ColumnarShuffleManager!"); + columnarShuffleIds.add(shuffleId); + return columnarShuffleManager().registerShuffle(shuffleId, dependency); + } else { + throw new GlutenException( + "The Celeborn service(Master: " + celebornConf.masterHost() + ") is unavailable"); + } } else { return registerCelebornShuffleHandle(shuffleId, dependency); } @@ -209,15 +215,17 @@ public ShuffleHandle registerShuffle( @Override public boolean unregisterShuffle(int shuffleId) { - if (columnarShuffleIds.contains(shuffleId)) { - if (columnarShuffleManager().unregisterShuffle(shuffleId)) { - return columnarShuffleIds.remove(shuffleId); - } else { - return false; - } + if (columnarShuffleIds.remove(shuffleId)) { + return columnarShuffleManager().unregisterShuffle(shuffleId); } return CelebornUtils.unregisterShuffle( - lifecycleManager, shuffleClient, shuffleIdTracker, shuffleId, appUniqueId, isDriver()); + lifecycleManager, + shuffleClient, + shuffleIdTracker, + shuffleId, + appUniqueId, + throwsFetchFailure, + isDriver()); } @Override @@ -299,7 +307,6 @@ public ShuffleWriter getWriter( return vanillaCelebornShuffleManager().getWriter(handle, mapId, context, metrics); } } else { - columnarShuffleIds.add(handle.shuffleId()); return columnarShuffleManager().getWriter(handle, mapId, context, metrics); } } catch (Exception e) { diff --git a/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornUtils.java b/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornUtils.java index 4593d019c27e3..6b4229ad30374 100644 --- a/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornUtils.java +++ b/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornUtils.java @@ -49,11 +49,21 @@ public static boolean unregisterShuffle( Object shuffleIdTracker, int appShuffleId, String appUniqueId, + boolean throwsFetchFailure, boolean isDriver) { try { - // for Celeborn 0.4.0 try { - if (lifecycleManager != null) { + try { + // for Celeborn 0.4.1 + if (lifecycleManager != null) { + Method unregisterAppShuffle = + lifecycleManager + .getClass() + .getMethod("unregisterAppShuffle", int.class, boolean.class); + unregisterAppShuffle.invoke(lifecycleManager, appShuffleId, throwsFetchFailure); + } + } catch (NoSuchMethodException ex) { + // for Celeborn 0.4.0 Method unregisterAppShuffle = lifecycleManager.getClass().getMethod("unregisterAppShuffle", int.class); unregisterAppShuffle.invoke(lifecycleManager, appShuffleId); @@ -65,7 +75,7 @@ public static boolean unregisterShuffle( unregisterAppShuffleId.invoke(shuffleIdTracker, 
shuffleClient, appShuffleId); } return true; - } catch (NoSuchMethodException ex) { + } catch (NoSuchMethodException | ClassNotFoundException ex) { try { if (lifecycleManager != null) { Method unregisterShuffleMethod = diff --git a/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarBatchSerializer.scala b/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarBatchSerializer.scala index 699626db12c50..1f125a164c8b6 100644 --- a/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarBatchSerializer.scala +++ b/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarBatchSerializer.scala @@ -19,7 +19,6 @@ package org.apache.spark.shuffle import org.apache.gluten.GlutenConfig import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators -import org.apache.gluten.memory.nmm.NativeMemoryManagers import org.apache.gluten.utils.ArrowAbiUtil import org.apache.gluten.vectorized._ @@ -65,7 +64,7 @@ private class CelebornColumnarBatchSerializerInstance( extends SerializerInstance with Logging { - private val nmm = NativeMemoryManagers.contextInstance("ShuffleReader") + private val runtime = Runtimes.contextInstance("CelebornShuffleReader") private val shuffleReaderHandle = { val allocator: BufferAllocator = ArrowBufferAllocators @@ -86,12 +85,11 @@ private class CelebornColumnarBatchSerializerInstance( GlutenConfig.getConf.columnarShuffleCodecBackend.orNull val shuffleWriterType = conf.get("spark.celeborn.client.spark.shuffle.writer", "hash").toLowerCase(Locale.ROOT) - val jniWrapper = ShuffleReaderJniWrapper.create() + val jniWrapper = ShuffleReaderJniWrapper.create(runtime) val batchSize = GlutenConfig.getConf.maxBatchSize val handle = jniWrapper .make( cSchema.memoryAddress(), - nmm.getNativeInstanceHandle, compressionCodec, compressionCodecBackend, batchSize, @@ -119,11 +117,10 @@ private class CelebornColumnarBatchSerializerInstance( with TaskResource { private val byteIn: JniByteInputStream = JniByteInputStreams.create(in) private val wrappedOut: GeneralOutIterator = new ColumnarBatchOutIterator( - Runtimes.contextInstance(), + runtime, ShuffleReaderJniWrapper - .create() - .readStream(shuffleReaderHandle, byteIn), - nmm) + .create(runtime) + .readStream(shuffleReaderHandle, byteIn)) private var cb: ColumnarBatch = _ diff --git a/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornHashBasedColumnarShuffleWriter.scala b/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornHashBasedColumnarShuffleWriter.scala index 37ea11a73d2a6..87b16c65bd097 100644 --- a/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornHashBasedColumnarShuffleWriter.scala +++ b/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornHashBasedColumnarShuffleWriter.scala @@ -18,10 +18,8 @@ package org.apache.spark.shuffle import org.apache.gluten.GlutenConfig import org.apache.gluten.columnarbatch.ColumnarBatches -import org.apache.gluten.memory.memtarget.MemoryTarget -import org.apache.gluten.memory.memtarget.Spiller -import org.apache.gluten.memory.memtarget.Spillers -import org.apache.gluten.memory.nmm.NativeMemoryManagers +import org.apache.gluten.exec.Runtimes +import org.apache.gluten.memory.memtarget.{MemoryTarget, Spiller, Spillers} import org.apache.gluten.vectorized._ import org.apache.spark._ @@ -35,7 +33,6 @@ import org.apache.celeborn.client.ShuffleClient import 
org.apache.celeborn.common.CelebornConf import java.io.IOException -import java.util class VeloxCelebornHashBasedColumnarShuffleWriter[K, V]( shuffleId: Int, @@ -52,7 +49,9 @@ class VeloxCelebornHashBasedColumnarShuffleWriter[K, V]( client, writeMetrics) { - private val jniWrapper = ShuffleWriterJniWrapper.create() + private val runtime = Runtimes.contextInstance("CelebornShuffleWriter") + + private val jniWrapper = ShuffleWriterJniWrapper.create(runtime) private var splitResult: SplitResult = _ @@ -105,31 +104,6 @@ class VeloxCelebornHashBasedColumnarShuffleWriter[K, V]( clientPushBufferMaxSize, clientPushSortMemoryThreshold, celebornPartitionPusher, - NativeMemoryManagers - .create( - "CelebornShuffleWriter", - new Spiller() { - override def spill(self: MemoryTarget, size: Long): Long = { - if (nativeShuffleWriter == -1L) { - throw new IllegalStateException( - "Fatal: spill() called before a celeborn shuffle writer " + - "is created. This behavior should be" + - "optimized by moving memory " + - "allocations from make() to split()") - } - logInfo(s"Gluten shuffle writer: Trying to push $size bytes of data") - // fixme pass true when being called by self - val pushed = - jniWrapper.nativeEvict(nativeShuffleWriter, size, false) - logInfo(s"Gluten shuffle writer: Pushed $pushed / $size bytes of data") - pushed - } - - override def applicablePhases(): util.Set[Spiller.Phase] = - Spillers.PHASE_SET_SPILL_ONLY - } - ) - .getNativeInstanceHandle, handle, context.taskAttemptId(), GlutenShuffleUtils.getStartPartitionId(dep.nativePartitioning, context.partitionId), @@ -137,6 +111,19 @@ class VeloxCelebornHashBasedColumnarShuffleWriter[K, V]( shuffleWriterType, GlutenConfig.getConf.columnarShuffleReallocThreshold ) + runtime.addSpiller(new Spiller() { + override def spill(self: MemoryTarget, phase: Spiller.Phase, size: Long): Long = { + if (!Spillers.PHASE_SET_SPILL_ONLY.contains(phase)) { + return 0L + } + logInfo(s"Gluten shuffle writer: Trying to push $size bytes of data") + // fixme pass true when being called by self + val pushed = + jniWrapper.nativeEvict(nativeShuffleWriter, size, false) + logInfo(s"Gluten shuffle writer: Pushed $pushed / $size bytes of data") + pushed + } + }) } val startTime = System.nanoTime() jniWrapper.write(nativeShuffleWriter, cb.numRows, handle, availableOffHeapPerTask()) diff --git a/gluten-core/pom.xml b/gluten-core/pom.xml index 3934d535ef45e..880eddb4ee953 100644 --- a/gluten-core/pom.xml +++ b/gluten-core/pom.xml @@ -24,10 +24,6 @@ gluten-ui ${project.version} - - com.github.ben-manes.caffeine - caffeine - org.apache.gluten ${sparkshim.artifactId} @@ -88,7 +84,7 @@ org.scalacheck scalacheck_${scala.binary.version} - 1.13.5 + 1.17.0 test @@ -115,13 +111,13 @@ org.scalatestplus - scalatestplus-mockito_2.12 + scalatestplus-mockito_${scala.binary.version} 1.0.0-M2 test org.scalatestplus - scalatestplus-scalacheck_2.12 + scalatestplus-scalacheck_${scala.binary.version} 3.1.0.0-RC2 test diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargets.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargets.java index 2d6fc0748464d..75e3db2e7d1f8 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargets.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargets.java @@ -22,7 +22,6 @@ import org.apache.spark.memory.TaskMemoryManager; -import java.util.List; import java.util.Map; public final class MemoryTargets { @@ -54,7 +53,7 @@ public static MemoryTarget 
dynamicOffHeapSizingIfEnabled(MemoryTarget memoryTarg public static MemoryTarget newConsumer( TaskMemoryManager tmm, String name, - List spillers, + Spiller spiller, Map virtualChildren) { final TreeMemoryConsumers.Factory factory; if (GlutenConfig.getConf().memoryIsolation()) { @@ -63,6 +62,6 @@ public static MemoryTarget newConsumer( factory = TreeMemoryConsumers.shared(); } - return dynamicOffHeapSizingIfEnabled(factory.newConsumer(tmm, name, spillers, virtualChildren)); + return factory.newConsumer(tmm, name, spiller, virtualChildren); } } diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/Spiller.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/Spiller.java index 0e0457a7c7e7b..a0ec6dc6c7c62 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/Spiller.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/Spiller.java @@ -16,14 +16,11 @@ */ package org.apache.gluten.memory.memtarget; -import java.util.Set; - public interface Spiller { + long spill(MemoryTarget self, Phase phase, long size); - long spill(MemoryTarget self, long size); - - Set applicablePhases(); - + // Order of the elements matters, since + // consumer should call spillers with in the defined order. enum Phase { SHRINK, SPILL diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/Spillers.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/Spillers.java index 6032b9494bc84..38ed88f577781 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/Spillers.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/Spillers.java @@ -16,16 +16,21 @@ */ package org.apache.gluten.memory.memtarget; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; -import java.util.Set; +import java.util.*; public final class Spillers { private Spillers() { // enclose factory ctor } + public static final Spiller NOOP = + new Spiller() { + @Override + public long spill(MemoryTarget self, Phase phase, long size) { + return 0; + } + }; + public static final Set PHASE_SET_ALL = Collections.unmodifiableSet( new HashSet<>(Arrays.asList(Spiller.Phase.SHRINK, Spiller.Phase.SPILL))); @@ -40,6 +45,10 @@ public static Spiller withMinSpillSize(Spiller spiller, long minSize) { return new WithMinSpillSize(spiller, minSize); } + public static AppendableSpillerList appendable() { + return new AppendableSpillerList(); + } + // Minimum spill target size should be larger than spark.gluten.memory.reservationBlockSize, // since any release action within size smaller than the block size may not have chance to // report back to the Java-side reservation listener. 
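Editor's note (sketch, not part of the patch): with this change a Spiller receives the phase explicitly instead of advertising applicablePhases(); callers walk Phase.values() (SHRINK first, then SPILL) and each implementation decides per call. A minimal phase-aware implementation in the style of the Celeborn writers above, where pushPartitionBuffers is a hypothetical callback:

import org.apache.gluten.memory.memtarget.MemoryTarget;
import org.apache.gluten.memory.memtarget.Spiller;
import org.apache.gluten.memory.memtarget.Spillers;

// Sketch of the new contract: react only to the SPILL phase and report 0 bytes for SHRINK.
final class PushOnSpill implements Spiller {
  @Override
  public long spill(MemoryTarget self, Spiller.Phase phase, long size) {
    if (!Spillers.PHASE_SET_SPILL_ONLY.contains(phase)) {
      return 0L; // ignore SHRINK requests
    }
    return pushPartitionBuffers(size); // hypothetical: push up to `size` bytes, return bytes freed
  }

  private long pushPartitionBuffers(long size) {
    return 0L; // placeholder for the actual push/evict call
  }
}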
@@ -53,13 +62,30 @@ private WithMinSpillSize(Spiller delegated, long minSize) { } @Override - public long spill(MemoryTarget self, long size) { - return delegated.spill(self, Math.max(size, minSize)); + public long spill(MemoryTarget self, Spiller.Phase phase, long size) { + return delegated.spill(self, phase, Math.max(size, minSize)); + } + } + + public static class AppendableSpillerList implements Spiller { + private final List spillers = new LinkedList<>(); + + private AppendableSpillerList() {} + + public void append(Spiller spiller) { + spillers.add(spiller); } @Override - public Set applicablePhases() { - return delegated.applicablePhases(); + public long spill(MemoryTarget self, Phase phase, final long size) { + long remainingBytes = size; + for (Spiller spiller : spillers) { + if (remainingBytes <= 0) { + break; + } + remainingBytes -= spiller.spill(self, phase, remainingBytes); + } + return size - remainingBytes; } } } diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTarget.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTarget.java index 37d44fd7a675c..9543bb5ed9eb1 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTarget.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTarget.java @@ -19,7 +19,6 @@ import org.apache.gluten.memory.MemoryUsageStatsBuilder; import org.apache.gluten.memory.memtarget.spark.TreeMemoryConsumer; -import java.util.List; import java.util.Map; /** An abstract for both {@link TreeMemoryConsumer} and it's non-consumer children nodes. */ @@ -29,12 +28,12 @@ public interface TreeMemoryTarget extends MemoryTarget, KnownNameAndStats { TreeMemoryTarget newChild( String name, long capacity, - List spillers, + Spiller spiller, Map virtualChildren); Map children(); TreeMemoryTarget parent(); - List getNodeSpillers(); + Spiller getNodeSpiller(); } diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTargets.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTargets.java index a8dc39ce3c16a..24d9fc0e2d4ac 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTargets.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTargets.java @@ -23,19 +23,14 @@ import com.google.common.base.Preconditions; import org.apache.spark.util.Utils; -import java.util.Arrays; import java.util.Collections; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.PriorityQueue; import java.util.Queue; -import java.util.function.Predicate; import java.util.stream.Collectors; public class TreeMemoryTargets { - public static final List SPILL_PHASES = - Arrays.asList(Spiller.Phase.SHRINK, Spiller.Phase.SPILL); private TreeMemoryTargets() { // enclose factory ctor @@ -45,26 +40,24 @@ public static TreeMemoryTarget newChild( TreeMemoryTarget parent, String name, long capacity, - List spillers, + Spiller spiller, Map virtualChildren) { - return new Node(parent, name, capacity, spillers, virtualChildren); + return new Node(parent, name, capacity, spiller, virtualChildren); } public static long spillTree(TreeMemoryTarget node, final long bytes) { long remainingBytes = bytes; - for (Spiller.Phase phase : SPILL_PHASES) { + for (Spiller.Phase phase : Spiller.Phase.values()) { // First shrink, then if no good, spill. 
if (remainingBytes <= 0) { break; } - remainingBytes -= - spillTree(node, remainingBytes, spiller -> spiller.applicablePhases().contains(phase)); + remainingBytes -= spillTree(node, phase, remainingBytes); } return bytes - remainingBytes; } - private static long spillTree( - TreeMemoryTarget node, final long bytes, Predicate spillerFilter) { + private static long spillTree(TreeMemoryTarget node, Spiller.Phase phase, final long bytes) { // sort children by used bytes, descending Queue q = new PriorityQueue<>( @@ -83,13 +76,9 @@ private static long spillTree( if (remainingBytes > 0) { // if still doesn't fit, spill self - final List applicableSpillers = - node.getNodeSpillers().stream().filter(spillerFilter).collect(Collectors.toList()); - for (int i = 0; i < applicableSpillers.size() && remainingBytes > 0; i++) { - final Spiller spiller = applicableSpillers.get(i); - long spilled = spiller.spill(node, remainingBytes); - remainingBytes -= spilled; - } + final Spiller spiller = node.getNodeSpiller(); + long spilled = spiller.spill(node, phase, remainingBytes); + remainingBytes -= spilled; } return bytes - remainingBytes; @@ -101,7 +90,7 @@ public static class Node implements TreeMemoryTarget, KnownNameAndStats { private final TreeMemoryTarget parent; private final String name; private final long capacity; - private final List spillers; + private final Spiller spiller; private final Map virtualChildren; private final SimpleMemoryUsageRecorder selfRecorder = new SimpleMemoryUsageRecorder(); @@ -109,7 +98,7 @@ private Node( TreeMemoryTarget parent, String name, long capacity, - List spillers, + Spiller spiller, Map virtualChildren) { this.parent = parent; this.capacity = capacity; @@ -119,7 +108,7 @@ private Node( } else { this.name = String.format("%s, %s", uniqueName, Utils.bytesToString(capacity)); } - this.spillers = Collections.unmodifiableList(spillers); + this.spiller = spiller; this.virtualChildren = virtualChildren; } @@ -140,8 +129,8 @@ private long borrow0(long size) { } @Override - public List getNodeSpillers() { - return spillers; + public Spiller getNodeSpiller() { + return spiller; } private boolean ensureFreeCapacity(long bytesNeeded) { @@ -209,9 +198,9 @@ public MemoryUsageStats stats() { public TreeMemoryTarget newChild( String name, long capacity, - List spillers, + Spiller spiller, Map virtualChildren) { - final Node child = new Node(this, name, capacity, spillers, virtualChildren); + final Node child = new Node(this, name, capacity, spiller, virtualChildren); if (children.containsKey(child.name())) { throw new IllegalArgumentException("Child already registered: " + child.name()); } diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/RegularMemoryConsumer.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/RegularMemoryConsumer.java index 3028547c007a8..76aa63aebb64d 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/RegularMemoryConsumer.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/RegularMemoryConsumer.java @@ -63,8 +63,16 @@ public RegularMemoryConsumer( } @Override - public long spill(long size, MemoryConsumer trigger) { - long spilledOut = spiller.spill(this, size); + public long spill(final long size, MemoryConsumer trigger) { + long remainingBytes = size; + for (Spiller.Phase phase : Spiller.Phase.values()) { + // First shrink, then if no good, spill. 
+ if (remainingBytes <= 0) { + break; + } + remainingBytes -= spiller.spill(this, phase, size); + } + long spilledOut = size - remainingBytes; if (TaskResources.inSparkTask()) { TaskResources.getLocalTaskContext().taskMetrics().incMemoryBytesSpilled(spilledOut); } diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumer.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumer.java index f86295b0697f9..44c725798c754 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumer.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumer.java @@ -18,11 +18,7 @@ import org.apache.gluten.memory.MemoryUsageStatsBuilder; import org.apache.gluten.memory.SimpleMemoryUsageRecorder; -import org.apache.gluten.memory.memtarget.MemoryTargetUtil; -import org.apache.gluten.memory.memtarget.MemoryTargetVisitor; -import org.apache.gluten.memory.memtarget.Spiller; -import org.apache.gluten.memory.memtarget.TreeMemoryTarget; -import org.apache.gluten.memory.memtarget.TreeMemoryTargets; +import org.apache.gluten.memory.memtarget.*; import org.apache.gluten.proto.MemoryUsageStats; import com.google.common.base.Preconditions; @@ -33,7 +29,6 @@ import java.io.IOException; import java.util.Collections; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -124,10 +119,10 @@ public long spill(long size, MemoryConsumer trigger) throws IOException { public TreeMemoryTarget newChild( String name, long capacity, - List spillers, + Spiller spiller, Map virtualChildren) { final TreeMemoryTarget child = - TreeMemoryTargets.newChild(this, name, capacity, spillers, virtualChildren); + TreeMemoryTargets.newChild(this, name, capacity, spiller, virtualChildren); if (children.containsKey(child.name())) { throw new IllegalArgumentException("Child already registered: " + child.name()); } @@ -147,9 +142,9 @@ public TreeMemoryTarget parent() { } @Override - public List getNodeSpillers() { + public Spiller getNodeSpiller() { // root doesn't spill - return Collections.emptyList(); + return Spillers.NOOP; } public TaskMemoryManager getTaskMemoryManager() { diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumers.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumers.java index 1da23d15e3539..7ab05bd3a2e79 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumers.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumers.java @@ -19,13 +19,13 @@ import org.apache.gluten.GlutenConfig; import org.apache.gluten.memory.MemoryUsageStatsBuilder; import org.apache.gluten.memory.memtarget.Spiller; +import org.apache.gluten.memory.memtarget.Spillers; import org.apache.gluten.memory.memtarget.TreeMemoryTarget; import org.apache.commons.collections.map.ReferenceMap; import org.apache.spark.memory.TaskMemoryManager; import java.util.Collections; -import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; @@ -76,7 +76,7 @@ private TreeMemoryTarget getSharedAccount(TaskMemoryManager tmm) { m -> { TreeMemoryTarget tmc = new TreeMemoryConsumer((TaskMemoryManager) m); return tmc.newChild( - "root", perTaskCapacity, Collections.emptyList(), Collections.emptyMap()); + "root", perTaskCapacity, Spillers.NOOP, Collections.emptyMap()); }); 
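Each tree memory target now carries exactly one Spiller instead of a list; the shared per-task root is registered with Spillers.NOOP because the root itself never spills (reclamation walks its children). A simplified stand-in sketch of that registration contract, including the duplicate-name check from the patch:

```scala
object TreeMemoryTargetSketch {
  // Simplified stand-in for the tree of memory targets: every node carries exactly one
  // Spiller (the root gets a no-op one), and child names must be unique under a parent.
  trait Spiller { def spill(requested: Long): Long }
  val Noop: Spiller = (_: Long) => 0L // counterpart of Spillers.NOOP: the root never spills itself

  final class Node(val name: String, val capacity: Long, val spiller: Spiller) {
    private val children = scala.collection.mutable.LinkedHashMap.empty[String, Node]

    def newChild(childName: String, childCapacity: Long, childSpiller: Spiller): Node = {
      require(!children.contains(childName), s"Child already registered: $childName")
      val child = new Node(childName, childCapacity, childSpiller)
      children += childName -> child
      child
    }
  }

  def main(args: Array[String]): Unit = {
    val root = new Node("root", 8L * 1024 * 1024, Noop)
    // A child with effectively unlimited capacity and a spiller of its own.
    root.newChild("shuffle-writer", Long.MaxValue, requested => requested)
  }
}
```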
} } @@ -84,11 +84,11 @@ private TreeMemoryTarget getSharedAccount(TaskMemoryManager tmm) { public TreeMemoryTarget newConsumer( TaskMemoryManager tmm, String name, - List spillers, + Spiller spiller, Map virtualChildren) { final TreeMemoryTarget account = getSharedAccount(tmm); return account.newChild( - name, TreeMemoryConsumer.CAPACITY_UNLIMITED, spillers, virtualChildren); + name, TreeMemoryConsumer.CAPACITY_UNLIMITED, spiller, virtualChildren); } } } diff --git a/gluten-core/src/main/java/org/apache/gluten/substrait/expression/BinaryLiteralNode.java b/gluten-core/src/main/java/org/apache/gluten/substrait/expression/BinaryLiteralNode.java index 864aeb741a154..3d1ee51741ee8 100644 --- a/gluten-core/src/main/java/org/apache/gluten/substrait/expression/BinaryLiteralNode.java +++ b/gluten-core/src/main/java/org/apache/gluten/substrait/expression/BinaryLiteralNode.java @@ -22,6 +22,9 @@ import com.google.protobuf.ByteString; import io.substrait.proto.Expression.Literal.Builder; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; + public class BinaryLiteralNode extends LiteralNodeWithValue { public BinaryLiteralNode(byte[] value) { super(value, new BinaryTypeNode(true)); @@ -33,6 +36,14 @@ public BinaryLiteralNode(byte[] value, TypeNode typeNode) { @Override protected void updateLiteralBuilder(Builder literalBuilder, byte[] value) { - literalBuilder.setBinary(ByteString.copyFrom(value)); + ByteString byteValue; + try { + Method m = ByteString.class.getDeclaredMethod("wrap", byte[].class); + m.setAccessible(true); + byteValue = (ByteString) m.invoke(null, value); + } catch (NoSuchMethodException | InvocationTargetException | IllegalAccessException e) { + throw new RuntimeException(e); + } + literalBuilder.setBinary(byteValue); } } diff --git a/gluten-core/src/main/java/org/apache/gluten/substrait/expression/ExpressionBuilder.java b/gluten-core/src/main/java/org/apache/gluten/substrait/expression/ExpressionBuilder.java index 5d106938cef5a..e322e1528cac9 100644 --- a/gluten-core/src/main/java/org/apache/gluten/substrait/expression/ExpressionBuilder.java +++ b/gluten-core/src/main/java/org/apache/gluten/substrait/expression/ExpressionBuilder.java @@ -21,6 +21,8 @@ import org.apache.gluten.substrait.type.*; import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.catalyst.expressions.Attribute; +import org.apache.spark.sql.catalyst.expressions.Expression; import org.apache.spark.sql.catalyst.util.ArrayData; import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.*; @@ -264,9 +266,10 @@ public static WindowFunctionNode makeWindowFunction( List expressionNodes, String columnName, TypeNode outputTypeNode, - String upperBound, - String lowerBound, - String frameType) { + Expression upperBound, + Expression lowerBound, + String frameType, + List originalInputAttributes) { return makeWindowFunction( functionId, expressionNodes, @@ -275,7 +278,8 @@ public static WindowFunctionNode makeWindowFunction( upperBound, lowerBound, frameType, - false); + false, + originalInputAttributes); } public static WindowFunctionNode makeWindowFunction( @@ -283,10 +287,11 @@ public static WindowFunctionNode makeWindowFunction( List expressionNodes, String columnName, TypeNode outputTypeNode, - String upperBound, - String lowerBound, + Expression upperBound, + Expression lowerBound, String frameType, - boolean ignoreNulls) { + boolean ignoreNulls, + List originalInputAttributes) { return new WindowFunctionNode( functionId, 
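BinaryLiteralNode now reaches the package-private ByteString.wrap(byte[]) through reflection, presumably to avoid the defensive copy made by ByteString.copyFrom. The sketch below is a hedged variant of that idea which falls back to copying when reflection is blocked; the fallback is an addition for illustration only, the patch itself rethrows the reflection failure.

```scala
import com.google.protobuf.ByteString

object ByteStringWrapSketch {
  // Tries the package-private, zero-copy ByteString.wrap(byte[]) via reflection and falls
  // back to the public copying API if the method is missing or inaccessible.
  // Note: wrapping hands the array to the ByteString without copying, so the caller must
  // not mutate it afterwards.
  def wrapOrCopy(bytes: Array[Byte]): ByteString =
    try {
      val m = classOf[ByteString].getDeclaredMethod("wrap", classOf[Array[Byte]])
      m.setAccessible(true)
      m.invoke(null, bytes).asInstanceOf[ByteString]
    } catch {
      case _: ReflectiveOperationException | _: SecurityException =>
        ByteString.copyFrom(bytes)
    }
}
```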
expressionNodes, @@ -295,6 +300,7 @@ public static WindowFunctionNode makeWindowFunction( upperBound, lowerBound, frameType, - ignoreNulls); + ignoreNulls, + originalInputAttributes); } } diff --git a/gluten-core/src/main/java/org/apache/gluten/substrait/expression/WindowFunctionNode.java b/gluten-core/src/main/java/org/apache/gluten/substrait/expression/WindowFunctionNode.java index 67d0d6e575fff..b9f1fbc126cca 100644 --- a/gluten-core/src/main/java/org/apache/gluten/substrait/expression/WindowFunctionNode.java +++ b/gluten-core/src/main/java/org/apache/gluten/substrait/expression/WindowFunctionNode.java @@ -16,17 +16,24 @@ */ package org.apache.gluten.substrait.expression; +import org.apache.gluten.exception.GlutenException; +import org.apache.gluten.expression.ExpressionConverter; import org.apache.gluten.substrait.type.TypeNode; import io.substrait.proto.Expression; import io.substrait.proto.FunctionArgument; import io.substrait.proto.FunctionOption; import io.substrait.proto.WindowType; +import org.apache.spark.sql.catalyst.expressions.Attribute; +import org.apache.spark.sql.catalyst.expressions.PreComputeRangeFrameBound; import java.io.Serializable; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import scala.collection.JavaConverters; + public class WindowFunctionNode implements Serializable { private final Integer functionId; private final List expressionNodes = new ArrayList<>(); @@ -34,23 +41,26 @@ public class WindowFunctionNode implements Serializable { private final String columnName; private final TypeNode outputTypeNode; - private final String upperBound; + private final org.apache.spark.sql.catalyst.expressions.Expression upperBound; - private final String lowerBound; + private final org.apache.spark.sql.catalyst.expressions.Expression lowerBound; private final String frameType; private final boolean ignoreNulls; + private final List originalInputAttributes; + WindowFunctionNode( Integer functionId, List expressionNodes, String columnName, TypeNode outputTypeNode, - String upperBound, - String lowerBound, + org.apache.spark.sql.catalyst.expressions.Expression upperBound, + org.apache.spark.sql.catalyst.expressions.Expression lowerBound, String frameType, - boolean ignoreNulls) { + boolean ignoreNulls, + List originalInputAttributes) { this.functionId = functionId; this.expressionNodes.addAll(expressionNodes); this.columnName = columnName; @@ -59,11 +69,13 @@ public class WindowFunctionNode implements Serializable { this.lowerBound = lowerBound; this.frameType = frameType; this.ignoreNulls = ignoreNulls; + this.originalInputAttributes = originalInputAttributes; } private Expression.WindowFunction.Bound.Builder setBound( - Expression.WindowFunction.Bound.Builder builder, String boundType) { - switch (boundType) { + Expression.WindowFunction.Bound.Builder builder, + org.apache.spark.sql.catalyst.expressions.Expression boundType) { + switch (boundType.sql()) { case ("CURRENT ROW"): Expression.WindowFunction.Bound.CurrentRow.Builder currentRowBuilder = Expression.WindowFunction.Bound.CurrentRow.newBuilder(); @@ -80,8 +92,36 @@ private Expression.WindowFunction.Bound.Builder setBound( builder.setUnboundedFollowing(followingBuilder.build()); break; default: - try { - Long offset = Long.valueOf(boundType); + if (boundType instanceof PreComputeRangeFrameBound) { + // Used only when backend is velox and frame type is RANGE. 
+ if (!frameType.equals("RANGE")) { + throw new GlutenException( + "Only Range frame supports PreComputeRangeFrameBound, but got " + frameType); + } + ExpressionNode refNode = + ExpressionConverter.replaceWithExpressionTransformer( + ((PreComputeRangeFrameBound) boundType).child().toAttribute(), + JavaConverters.asScalaIteratorConverter(originalInputAttributes.iterator()) + .asScala() + .toSeq()) + .doTransform(new HashMap()); + Long offset = Long.valueOf(boundType.eval(null).toString()); + if (offset < 0) { + Expression.WindowFunction.Bound.Preceding.Builder refPrecedingBuilder = + Expression.WindowFunction.Bound.Preceding.newBuilder(); + refPrecedingBuilder.setRef(refNode.toProtobuf()); + builder.setPreceding(refPrecedingBuilder.build()); + } else { + Expression.WindowFunction.Bound.Following.Builder refFollowingBuilder = + Expression.WindowFunction.Bound.Following.newBuilder(); + refFollowingBuilder.setRef(refNode.toProtobuf()); + builder.setFollowing(refFollowingBuilder.build()); + } + } else if (boundType.foldable()) { + // Used when + // 1. Velox backend and frame type is ROW + // 2. Clickhouse backend + Long offset = Long.valueOf(boundType.eval(null).toString()); if (offset < 0) { Expression.WindowFunction.Bound.Preceding.Builder offsetPrecedingBuilder = Expression.WindowFunction.Bound.Preceding.newBuilder(); @@ -93,9 +133,9 @@ private Expression.WindowFunction.Bound.Builder setBound( offsetFollowingBuilder.setOffset(offset); builder.setFollowing(offsetFollowingBuilder.build()); } - } catch (NumberFormatException e) { + } else { throw new UnsupportedOperationException( - "Unsupported Window Function Frame Type:" + boundType); + "Unsupported Window Function Frame Bound Type: " + boundType); } } return builder; diff --git a/gluten-core/src/main/java/org/apache/gluten/substrait/rel/LocalFilesBuilder.java b/gluten-core/src/main/java/org/apache/gluten/substrait/rel/LocalFilesBuilder.java index 94acc83367f5d..7e085f81f4e66 100644 --- a/gluten-core/src/main/java/org/apache/gluten/substrait/rel/LocalFilesBuilder.java +++ b/gluten-core/src/main/java/org/apache/gluten/substrait/rel/LocalFilesBuilder.java @@ -27,6 +27,8 @@ public static LocalFilesNode makeLocalFiles( List paths, List starts, List lengths, + List fileSizes, + List modificationTimes, List> partitionColumns, List> metadataColumns, LocalFilesNode.ReadFileFormat fileFormat, @@ -36,6 +38,8 @@ public static LocalFilesNode makeLocalFiles( paths, starts, lengths, + fileSizes, + modificationTimes, partitionColumns, metadataColumns, fileFormat, diff --git a/gluten-core/src/main/java/org/apache/gluten/substrait/rel/LocalFilesNode.java b/gluten-core/src/main/java/org/apache/gluten/substrait/rel/LocalFilesNode.java index cbcda72dd03a3..fa9f3d51612bb 100644 --- a/gluten-core/src/main/java/org/apache/gluten/substrait/rel/LocalFilesNode.java +++ b/gluten-core/src/main/java/org/apache/gluten/substrait/rel/LocalFilesNode.java @@ -34,6 +34,8 @@ public class LocalFilesNode implements SplitInfo { private final List paths = new ArrayList<>(); private final List starts = new ArrayList<>(); private final List lengths = new ArrayList<>(); + private final List fileSizes = new ArrayList<>(); + private final List modificationTimes = new ArrayList<>(); private final List> partitionColumns = new ArrayList<>(); private final List> metadataColumns = new ArrayList<>(); private final List preferredLocations = new ArrayList<>(); @@ -60,6 +62,8 @@ public enum ReadFileFormat { List paths, List starts, List lengths, + List fileSizes, + List modificationTimes, List> 
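setBound now receives the Catalyst bound expression rather than a pre-rendered SQL string. The resulting decision is: CURRENT ROW and the UNBOUNDED markers keep their dedicated builders, a PreComputeRangeFrameBound (RANGE frames on the Velox backend) becomes a column reference resolved against originalInputAttributes, and any other foldable bound is evaluated to a signed offset whose sign selects preceding versus following. A self-contained sketch of that decision tree with simplified stand-in types (the real code emits Substrait protobuf builders and resolves the reference through ExpressionConverter):

```scala
object FrameBoundSketch {
  // Stand-ins for the protobuf Bound variants produced by WindowFunctionNode.setBound.
  sealed trait Bound
  case object CurrentRow extends Bound
  case object UnboundedPreceding extends Bound
  case object UnboundedFollowing extends Bound
  final case class PrecedingOffset(offset: Long) extends Bound
  final case class FollowingOffset(offset: Long) extends Bound
  final case class PrecedingRef(column: String) extends Bound
  final case class FollowingRef(column: String) extends Bound

  // Inputs, simplified: a special marker, a pre-computed range-frame column, or a literal offset.
  sealed trait FrameExpr
  final case class Marker(sql: String) extends FrameExpr
  final case class PreComputedRangeBound(column: String, offset: Long) extends FrameExpr
  final case class LiteralOffset(offset: Long) extends FrameExpr

  def toBound(expr: FrameExpr, frameType: String): Bound = expr match {
    case Marker("CURRENT ROW")         => CurrentRow
    case Marker("UNBOUNDED PRECEDING") => UnboundedPreceding
    case Marker("UNBOUNDED FOLLOWING") => UnboundedFollowing
    case PreComputedRangeBound(col, offset) =>
      // Pre-computed bounds are only meaningful for RANGE frames.
      require(frameType == "RANGE", "Only Range frame supports PreComputeRangeFrameBound")
      if (offset < 0) PrecedingRef(col) else FollowingRef(col)
    case LiteralOffset(offset) =>
      if (offset < 0) PrecedingOffset(offset) else FollowingOffset(offset)
    case other =>
      throw new UnsupportedOperationException(s"Unsupported Window Function Frame Bound Type: $other")
  }
}
```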
partitionColumns, List> metadataColumns, ReadFileFormat fileFormat, @@ -68,6 +72,8 @@ public enum ReadFileFormat { this.paths.addAll(paths); this.starts.addAll(starts); this.lengths.addAll(lengths); + this.fileSizes.addAll(fileSizes); + this.modificationTimes.addAll(modificationTimes); this.fileFormat = fileFormat; this.partitionColumns.addAll(partitionColumns); this.metadataColumns.addAll(metadataColumns); @@ -153,6 +159,18 @@ public ReadRel.LocalFiles toProtobuf() { } fileBuilder.setLength(lengths.get(i)); fileBuilder.setStart(starts.get(i)); + + if (!fileSizes.isEmpty() + && !modificationTimes.isEmpty() + && fileSizes.size() == modificationTimes.size() + && fileSizes.size() == paths.size()) { + ReadRel.LocalFiles.FileOrFiles.fileProperties.Builder filePropsBuilder = + ReadRel.LocalFiles.FileOrFiles.fileProperties.newBuilder(); + filePropsBuilder.setFileSize(fileSizes.get(i)); + filePropsBuilder.setModificationTime(modificationTimes.get(i)); + fileBuilder.setProperties(filePropsBuilder.build()); + } + if (!metadataColumns.isEmpty()) { Map metadataColumn = metadataColumns.get(i); if (!metadataColumn.isEmpty()) { diff --git a/gluten-core/src/main/java/org/apache/gluten/vectorized/JniWorkspace.java b/gluten-core/src/main/java/org/apache/gluten/vectorized/JniWorkspace.java index a7c12387a2218..810c945d35ab6 100644 --- a/gluten-core/src/main/java/org/apache/gluten/vectorized/JniWorkspace.java +++ b/gluten-core/src/main/java/org/apache/gluten/vectorized/JniWorkspace.java @@ -75,14 +75,14 @@ private static JniWorkspace createDefault() { } } - public static void enableDebug() { + public static void enableDebug(String debugDir) { // Preserve the JNI libraries even after process exits. // This is useful for debugging native code if the debug symbols were embedded in // the libraries. synchronized (DEFAULT_INSTANCE_INIT_LOCK) { if (DEBUG_INSTANCE == null) { final File tempRoot = - Paths.get("/tmp").resolve("gluten-jni-debug-" + UUID.randomUUID()).toFile(); + Paths.get(debugDir).resolve("gluten-jni-debug-" + UUID.randomUUID()).toFile(); try { FileUtils.forceMkdir(tempRoot); } catch (IOException e) { diff --git a/gluten-core/src/main/resources/substrait/proto/substrait/algebra.proto b/gluten-core/src/main/resources/substrait/proto/substrait/algebra.proto index 266aba4b01576..0e51baf5ad4c2 100644 --- a/gluten-core/src/main/resources/substrait/proto/substrait/algebra.proto +++ b/gluten-core/src/main/resources/substrait/proto/substrait/algebra.proto @@ -198,6 +198,13 @@ message ReadRel { string value = 2; } repeated metadataColumn metadata_columns = 19; + + // File properties contained in split + message fileProperties { + int64 fileSize = 1; + int64 modificationTime = 2; + } + fileProperties properties = 20; } } } @@ -989,18 +996,28 @@ message Expression { message Bound { // Defines that the bound extends this far back from the current record. message Preceding { - // A strictly positive integer specifying the number of records that - // the window extends back from the current record. Required. Use - // CurrentRow for offset zero and Following for negative offsets. - int64 offset = 1; + oneof kind { + // A strictly positive integer specifying the number of records that + // the window extends back from the current record. Use + // CurrentRow for offset zero and Following for negative offsets. + int64 offset = 1; + + // the reference to pre-project range frame boundary. + Expression ref = 2; + } } // Defines that the bound extends this far ahead of the current record. 
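LocalFilesNode only serializes the new per-file properties when both optional lists are populated and line up with paths, so splits built without this metadata produce the same protobuf as before. A small sketch of that guard using plain case classes instead of the generated ReadRel builders:

```scala
object FilePropertiesSketch {
  final case class FileProperties(fileSize: Long, modificationTime: Long)
  final case class FileOrFiles(path: String, properties: Option[FileProperties])

  // Mirrors the toProtobuf guard: attach properties only when both lists are non-empty
  // and their lengths line up with the path list.
  def buildFiles(
      paths: Seq[String],
      fileSizes: Seq[Long],
      modificationTimes: Seq[Long]): Seq[FileOrFiles] = {
    val aligned =
      fileSizes.nonEmpty && modificationTimes.nonEmpty &&
        fileSizes.size == modificationTimes.size && fileSizes.size == paths.size
    paths.zipWithIndex.map {
      case (path, i) =>
        val props =
          if (aligned) Some(FileProperties(fileSizes(i), modificationTimes(i))) else None
        FileOrFiles(path, props)
    }
  }
}
```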
message Following { - // A strictly positive integer specifying the number of records that - // the window extends ahead of the current record. Required. Use - // CurrentRow for offset zero and Preceding for negative offsets. - int64 offset = 1; + oneof kind { + // A strictly positive integer specifying the number of records that + // the window extends ahead of the current record. Use + // CurrentRow for offset zero and Preceding for negative offsets. + int64 offset = 1; + + // the reference to pre-project range frame boundary. + Expression ref = 2; + } } // Defines that the bound extends to or from the current record. diff --git a/gluten-core/src/main/resources/substrait/proto/substrait/type.proto b/gluten-core/src/main/resources/substrait/proto/substrait/type.proto index 5c7ee6a382ce1..b5fcb95623acc 100644 --- a/gluten-core/src/main/resources/substrait/proto/substrait/type.proto +++ b/gluten-core/src/main/resources/substrait/proto/substrait/type.proto @@ -238,5 +238,6 @@ message NamedStruct { NORMAL_COL = 0; PARTITION_COL = 1; METADATA_COL = 2; + ROWINDEX_COL = 3; } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/GlutenPlugin.scala b/gluten-core/src/main/scala/org/apache/gluten/GlutenPlugin.scala index 6c3d62c1e207b..16929ca4bd4a7 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/GlutenPlugin.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/GlutenPlugin.scala @@ -22,7 +22,7 @@ import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.events.GlutenBuildInfoEvent import org.apache.gluten.exception.GlutenException import org.apache.gluten.expression.ExpressionMappings -import org.apache.gluten.extension.{ColumnarOverrides, OthersExtensionOverrides, QueryStagePrepOverrides, StrategyOverrides} +import org.apache.gluten.extension.{ColumnarOverrides, OthersExtensionOverrides, QueryStagePrepOverrides} import org.apache.gluten.test.TestStats import org.apache.gluten.utils.TaskListener @@ -31,7 +31,6 @@ import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext, import org.apache.spark.internal.Logging import org.apache.spark.listener.GlutenListenerFactory import org.apache.spark.network.util.JavaUtils -import org.apache.spark.rpc.{GlutenDriverEndpoint, GlutenExecutorEndpoint} import org.apache.spark.sql.SparkSessionExtensions import org.apache.spark.sql.execution.ui.GlutenEventUtils import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} @@ -73,8 +72,7 @@ private[gluten] class GlutenDriverPlugin extends DriverPlugin with Logging { } // Initialize Backends API BackendsApiManager.initialize() - BackendsApiManager.getListenerApiInstance.onDriverStart(conf) - GlutenDriverEndpoint.glutenDriverEndpointRef = (new GlutenDriverEndpoint).self + BackendsApiManager.getListenerApiInstance.onDriverStart(sc, pluginContext) GlutenListenerFactory.addToSparkListenerBus(sc) ExpressionMappings.expressionExtensionTransformer = ExpressionUtil.extendedExpressionTransformer( @@ -145,6 +143,12 @@ private[gluten] class GlutenDriverPlugin extends DriverPlugin with Logging { } conf.set(SPARK_SESSION_EXTS_KEY, extensions) + // adaptive custom cost evaluator class + if (GlutenConfig.getConf.enableGluten && GlutenConfig.getConf.enableGlutenCostEvaluator) { + val costEvaluator = "org.apache.spark.sql.execution.adaptive.GlutenCostEvaluator" + conf.set(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS.key, costEvaluator) + } + // check memory off-heap enabled and size val minOffHeapSize = "1MB" if ( @@ -164,8 +168,9 @@ private[gluten] class 
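The driver plugin now points Spark's adaptive execution at a Gluten-specific cost evaluator, but only when both switches are on. A sketch of that conditional wiring; the two booleans stand in for GlutenConfig.getConf.enableGluten and enableGlutenCostEvaluator:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.sql.internal.SQLConf

object CostEvaluatorConfSketch {
  // Mirrors the new driver-plugin logic: only when Gluten and its cost-evaluator switch
  // are both enabled does AQE get the Gluten cost evaluator class.
  def configure(conf: SparkConf, glutenEnabled: Boolean, costEvaluatorEnabled: Boolean): SparkConf = {
    if (glutenEnabled && costEvaluatorEnabled) {
      conf.set(
        SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS.key,
        "org.apache.spark.sql.execution.adaptive.GlutenCostEvaluator")
    }
    conf
  }
}
```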
GlutenDriverPlugin extends DriverPlugin with Logging { // task slots val taskSlots = SparkResourceUtil.getTaskSlots(conf) + conf.set(GlutenConfig.GLUTEN_NUM_TASK_SLOTS_PER_EXECUTOR_KEY, taskSlots.toString) - var onHeapSize: Long = + val onHeapSize: Long = if (conf.contains(GlutenConfig.GLUTEN_ONHEAP_SIZE_KEY)) { conf.getSizeAsBytes(GlutenConfig.GLUTEN_ONHEAP_SIZE_KEY) } else { @@ -177,7 +182,7 @@ private[gluten] class GlutenDriverPlugin extends DriverPlugin with Logging { // size. Otherwise, the off-heap size is set to the value specified by the user (if any). // Note that this means that we will IGNORE the off-heap size specified by the user if the // dynamic off-heap feature is enabled. - var offHeapSize: Long = + val offHeapSize: Long = if (conf.getBoolean(GlutenConfig.GLUTEN_DYNAMIC_OFFHEAP_SIZING_ENABLED, false)) { // Since when dynamic off-heap sizing is enabled, we commingle on-heap // and off-heap memory, we set the off-heap size to the usable on-heap size. We will @@ -257,7 +262,6 @@ private[gluten] class GlutenDriverPlugin extends DriverPlugin with Logging { } private[gluten] class GlutenExecutorPlugin extends ExecutorPlugin { - private var executorEndpoint: GlutenExecutorEndpoint = _ private val taskListeners: Seq[TaskListener] = Array(TaskResources) /** Initialize the executor plugin. */ @@ -267,8 +271,7 @@ private[gluten] class GlutenExecutorPlugin extends ExecutorPlugin { // Initialize Backends API // TODO categorize the APIs by driver's or executor's BackendsApiManager.initialize() - BackendsApiManager.getListenerApiInstance.onExecutorStart(conf) - executorEndpoint = new GlutenExecutorEndpoint(ctx.executorID(), conf) + BackendsApiManager.getListenerApiInstance.onExecutorStart(ctx) } /** Clean up and terminate this plugin. For example: close the native engine. 
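The driver plugin also publishes the per-executor task-slot count into the Spark conf. SparkResourceUtil.getTaskSlots is not shown in this hunk; the sketch below assumes the conventional definition of executor cores divided by spark.task.cpus, and the conf key string is a placeholder for GlutenConfig.GLUTEN_NUM_TASK_SLOTS_PER_EXECUTOR_KEY, whose literal value is not visible here.

```scala
import org.apache.spark.SparkConf

object TaskSlotsConfSketch {
  // Assumption: task slots per executor = executor cores / cpus per task, which is the
  // conventional Spark definition; the real lookup lives in SparkResourceUtil.getTaskSlots.
  def taskSlots(conf: SparkConf): Int = {
    val executorCores = conf.getInt("spark.executor.cores", 1)
    val cpusPerTask = conf.getInt("spark.task.cpus", 1)
    math.max(1, executorCores / cpusPerTask)
  }

  def publish(conf: SparkConf): SparkConf =
    // Placeholder key standing in for GlutenConfig.GLUTEN_NUM_TASK_SLOTS_PER_EXECUTOR_KEY.
    conf.set("spark.gluten.numTaskSlotsPerExecutor", taskSlots(conf).toString)
}
```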
*/ @@ -297,7 +300,7 @@ private[gluten] class GlutenSessionExtensions extends (SparkSessionExtensions => } private[gluten] trait GlutenSparkExtensionsInjector { - def inject(extensions: SparkSessionExtensions) + def inject(extensions: SparkSessionExtensions): Unit } private[gluten] object GlutenPlugin { @@ -309,7 +312,6 @@ private[gluten] object GlutenPlugin { val DEFAULT_INJECTORS: List[GlutenSparkExtensionsInjector] = List( QueryStagePrepOverrides, ColumnarOverrides, - StrategyOverrides, OthersExtensionOverrides ) } diff --git a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/Backend.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/Backend.scala index 6ad78e10536f3..2c465ac619936 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/Backend.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/Backend.scala @@ -33,8 +33,6 @@ trait Backend { def listenerApi(): ListenerApi - def broadcastApi(): BroadcastApi - def settings(): BackendSettingsApi } diff --git a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendSettingsApi.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendSettingsApi.scala index 9c5c13271aebf..d159486373ace 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendSettingsApi.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendSettingsApi.scala @@ -43,6 +43,9 @@ trait BackendSettingsApi { options: Map[String, String]): ValidationResult = ValidationResult.ok def supportNativeWrite(fields: Array[StructField]): Boolean = true def supportNativeMetadataColumns(): Boolean = false + def supportNativeRowIndexColumn(): Boolean = false + def supportNativeInputFileRelatedExpr(): Boolean = false + def supportExpandExec(): Boolean = false def supportSortExec(): Boolean = false def supportSortMergeJoinExec(): Boolean = true @@ -83,7 +86,6 @@ trait BackendSettingsApi { * the result columns from the shuffle. 
*/ def supportShuffleWithProject(outputPartitioning: Partitioning, child: SparkPlan): Boolean = false - def utilizeShuffledHashJoinHint(): Boolean = false def excludeScanExecFromCollapsedStage(): Boolean = false def rescaleDecimalArithmetic: Boolean = false @@ -144,10 +146,14 @@ trait BackendSettingsApi { def supportBroadcastNestedLoopJoinExec(): Boolean = false + def supportSampleExec(): Boolean = false + /** Merge two phases hash based aggregate if need */ def mergeTwoPhasesHashBaseAggregateIfNeed(): Boolean = false def supportColumnarArrowUdf(): Boolean = false def generateHdfsConfForLibhdfs(): Boolean = false + + def needPreComputeRangeFrameBoundary(): Boolean = false } diff --git a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala index 1d9690d178423..f2c93d8c70fc7 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala @@ -83,10 +83,6 @@ object BackendsApiManager { backend.metricsApi() } - def getBroadcastApiInstance: BroadcastApi = { - backend.broadcastApi() - } - def getSettings: BackendSettingsApi = { backend.settings } diff --git a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BroadcastApi.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BroadcastApi.scala deleted file mode 100644 index 8b8b0d649a779..0000000000000 --- a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BroadcastApi.scala +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.gluten.backendsapi - -trait BroadcastApi { - - /** - * Should call by driver. Collect Broadcast Hash Table Ids. - * - * @param executionId - * execution id - * @param buildTableId - * build table id - */ - def collectExecutionBroadcastTableId(executionId: String, buildTableId: String): Unit = {} - - /** - * Should call by executor. On execution end. Clean executor broadcast build hashtable. 
- * - * @param executionId - * execution id - * @param broadcastTableIds - * broadcast table ids - */ - def cleanExecutionBroadcastTable( - executionId: String, - broadcastTableIds: java.util.Set[String]): Unit = {} -} diff --git a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/IteratorApi.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/IteratorApi.scala index d999948d70478..53dc8f47861fb 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/IteratorApi.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/IteratorApi.scala @@ -24,7 +24,6 @@ import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat import org.apache.gluten.substrait.rel.SplitInfo import org.apache.spark._ -import org.apache.spark.rdd.RDD import org.apache.spark.sql.connector.read.InputPartition import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.types.StructType @@ -81,14 +80,4 @@ trait IteratorApi { partitionIndex: Int, materializeInput: Boolean = false): Iterator[ColumnarBatch] // scalastyle:on argcount - - /** Generate Native FileScanRDD, currently only for ClickHouse Backend. */ - def genNativeFileScanRDD( - sparkContext: SparkContext, - wsCxt: WholeStageTransformContext, - splitInfos: Seq[SplitInfo], - scan: BasicScanExecTransformer, - numOutputRows: SQLMetric, - numOutputBatches: SQLMetric, - scanTime: SQLMetric): RDD[ColumnarBatch] } diff --git a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/ListenerApi.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/ListenerApi.scala index aaba345fc4fd8..bad169b721407 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/ListenerApi.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/ListenerApi.scala @@ -16,11 +16,12 @@ */ package org.apache.gluten.backendsapi -import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +import org.apache.spark.api.plugin.PluginContext trait ListenerApi { - def onDriverStart(conf: SparkConf): Unit = {} + def onDriverStart(sc: SparkContext, pc: PluginContext): Unit = {} def onDriverShutdown(): Unit = {} - def onExecutorStart(conf: SparkConf): Unit = {} + def onExecutorStart(pc: PluginContext): Unit = {} def onExecutorShutdown(): Unit = {} } diff --git a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/MetricsApi.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/MetricsApi.scala index 99b44a2de350f..a96f27f5a8a33 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/MetricsApi.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/MetricsApi.scala @@ -113,6 +113,10 @@ trait MetricsApi extends Serializable { def genNestedLoopJoinTransformerMetricsUpdater(metrics: Map[String, SQLMetric]): MetricsUpdater + def genSampleTransformerMetrics(sparkContext: SparkContext): Map[String, SQLMetric] + + def genSampleTransformerMetricsUpdater(metrics: Map[String, SQLMetric]): MetricsUpdater + def genColumnarInMemoryTableMetrics(sparkContext: SparkContext): Map[String, SQLMetric] = Map("numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) } diff --git a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/SparkPlanExecApi.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/SparkPlanExecApi.scala index 429b926cdceb7..ff7449e2d3404 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/SparkPlanExecApi.scala +++ 
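ListenerApi hooks now receive the SparkContext and PluginContext instead of a bare SparkConf. A sketch of how a backend implementation might adapt; the class name is illustrative, and the old conf remains reachable through PluginContext.conf():

```scala
import org.apache.gluten.backendsapi.ListenerApi
import org.apache.spark.SparkContext
import org.apache.spark.api.plugin.PluginContext

// Illustrative backend listener: the previous SparkConf-based hooks can usually be kept
// by pulling the conf back out of the richer contexts.
class SketchListenerApi extends ListenerApi {
  override def onDriverStart(sc: SparkContext, pc: PluginContext): Unit = {
    val conf = pc.conf() // same SparkConf the old onDriverStart(conf) used to receive
    // driver-side native initialization, driven by `conf` and `sc`, would go here
  }

  override def onExecutorStart(pc: PluginContext): Unit = {
    val conf = pc.conf()
    // executor-side native initialization, driven by `conf`, would go here
  }
}
```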
b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/SparkPlanExecApi.scala @@ -20,6 +20,7 @@ import org.apache.gluten.exception.GlutenNotSupportException import org.apache.gluten.execution._ import org.apache.gluten.expression._ import org.apache.gluten.extension.columnar.transition.{Convention, ConventionFunc} +import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.substrait.expression.{ExpressionBuilder, ExpressionNode, WindowFunctionNode} import org.apache.spark.ShuffleDependency @@ -101,7 +102,7 @@ trait SparkPlanExecApi { aggregateExpressions: Seq[AggregateExpression], aggregateAttributes: Seq[Attribute]): HashAggregateExecPullOutBaseHelper - def genColumnarShuffleExchange(shuffle: ShuffleExchangeExec, newChild: SparkPlan): SparkPlan + def genColumnarShuffleExchange(shuffle: ShuffleExchangeExec): SparkPlan /** Generate ShuffledHashJoinExecTransformer. */ def genShuffledHashJoinExecTransformer( @@ -125,6 +126,13 @@ trait SparkPlanExecApi { right: SparkPlan, isNullAwareAntiJoin: Boolean = false): BroadcastHashJoinExecTransformerBase + def genSampleExecTransformer( + lowerBound: Double, + upperBound: Double, + withReplacement: Boolean, + seed: Long, + child: SparkPlan): SampleExecTransformer + /** Generate ShuffledHashJoinExecTransformer. */ def genSortMergeJoinExecTransformer( leftKeys: Seq[Expression], @@ -199,12 +207,13 @@ trait SparkPlanExecApi { GenericExpressionTransformer(substraitExprName, Seq(), original) } - def genTryAddTransformer( + def genTryArithmeticTransformer( substraitExprName: String, left: ExpressionTransformer, right: ExpressionTransformer, - original: TryEval): ExpressionTransformer = { - throw new GlutenNotSupportException("try_add is not supported") + original: TryEval, + checkArithmeticExprName: String): ExpressionTransformer = { + throw new GlutenNotSupportException(s"$checkArithmeticExprName is not supported") } def genTryEvalTransformer( @@ -214,11 +223,12 @@ trait SparkPlanExecApi { throw new GlutenNotSupportException("try_eval is not supported") } - def genAddTransformer( + def genArithmeticTransformer( substraitExprName: String, left: ExpressionTransformer, right: ExpressionTransformer, - original: Add): ExpressionTransformer = { + original: Expression, + checkArithmeticExprName: String): ExpressionTransformer = { GenericExpressionTransformer(substraitExprName, Seq(left, right), original) } @@ -296,6 +306,13 @@ trait SparkPlanExecApi { GenericExpressionTransformer(substraitExprName, children, expr) } + def genPreciseTimestampConversionTransformer( + substraitExprName: String, + children: Seq[ExpressionTransformer], + expr: PreciseTimestampConversion): ExpressionTransformer = { + throw new GlutenNotSupportException("PreciseTimestampConversion is not supported") + } + /** * Generate ShuffleDependency for ColumnarShuffleExchangeExec. 
* @@ -413,7 +430,9 @@ trait SparkPlanExecApi { * * @return */ - def genExtendedColumnarPostRules(): List[SparkSession => Rule[SparkPlan]] + def genExtendedColumnarPostRules(): List[SparkSession => Rule[SparkPlan]] = { + SparkShimLoader.getSparkShims.getExtendedColumnarPostRules() ::: List() + } def genInjectPostHocResolutionRules(): List[SparkSession => Rule[LogicalPlan]] @@ -445,13 +464,6 @@ trait SparkPlanExecApi { original) } - def genSizeExpressionTransformer( - substraitExprName: String, - child: ExpressionTransformer, - original: Size): ExpressionTransformer = { - GenericExpressionTransformer(substraitExprName, Seq(child), original) - } - def genLikeTransformer( substraitExprName: String, left: ExpressionTransformer, @@ -520,9 +532,10 @@ trait SparkPlanExecApi { new JArrayList[ExpressionNode](), columnName, ConverterUtils.getTypeNode(aggWindowFunc.dataType, aggWindowFunc.nullable), - WindowExecTransformer.getFrameBound(frame.upper), - WindowExecTransformer.getFrameBound(frame.lower), - frame.frameType.sql + frame.upper, + frame.lower, + frame.frameType.sql, + originalInputAttributes.asJava ) windowExpressionNodes.add(windowFunctionNode) case aggExpression: AggregateExpression => @@ -545,9 +558,10 @@ trait SparkPlanExecApi { childrenNodeList, columnName, ConverterUtils.getTypeNode(aggExpression.dataType, aggExpression.nullable), - WindowExecTransformer.getFrameBound(frame.upper), - WindowExecTransformer.getFrameBound(frame.lower), - frame.frameType.sql + frame.upper, + frame.lower, + frame.frameType.sql, + originalInputAttributes.asJava ) windowExpressionNodes.add(windowFunctionNode) case wf @ (_: Lead | _: Lag) => @@ -581,10 +595,11 @@ trait SparkPlanExecApi { childrenNodeList, columnName, ConverterUtils.getTypeNode(offsetWf.dataType, offsetWf.nullable), - WindowExecTransformer.getFrameBound(frame.upper), - WindowExecTransformer.getFrameBound(frame.lower), + frame.upper, + frame.lower, frame.frameType.sql, - offsetWf.ignoreNulls + offsetWf.ignoreNulls, + originalInputAttributes.asJava ) windowExpressionNodes.add(windowFunctionNode) case wf @ NthValue(input, offset: Literal, ignoreNulls: Boolean) => @@ -600,10 +615,11 @@ trait SparkPlanExecApi { childrenNodeList, columnName, ConverterUtils.getTypeNode(wf.dataType, wf.nullable), - frame.upper.sql, - frame.lower.sql, + frame.upper, + frame.lower, frame.frameType.sql, - ignoreNulls + ignoreNulls, + originalInputAttributes.asJava ) windowExpressionNodes.add(windowFunctionNode) case wf @ NTile(buckets: Expression) => @@ -616,9 +632,10 @@ trait SparkPlanExecApi { childrenNodeList, columnName, ConverterUtils.getTypeNode(wf.dataType, wf.nullable), - frame.upper.sql, - frame.lower.sql, - frame.frameType.sql + frame.upper, + frame.lower, + frame.frameType.sql, + originalInputAttributes.asJava ) windowExpressionNodes.add(windowFunctionNode) case _ => @@ -647,17 +664,20 @@ trait SparkPlanExecApi { def postProcessPushDownFilter( extraFilters: Seq[Expression], sparkExecNode: LeafExecNode): Seq[Expression] = { + def getPushedFilter(dataFilters: Seq[Expression]): Seq[Expression] = { + val pushedFilters = + dataFilters ++ FilterHandler.getRemainingFilters(dataFilters, extraFilters) + pushedFilters.filterNot(_.references.exists { + attr => SparkShimLoader.getSparkShims.isRowIndexMetadataColumn(attr.name) + }) + } sparkExecNode match { case fileSourceScan: FileSourceScanExec => - fileSourceScan.dataFilters ++ FilterHandler.getRemainingFilters( - fileSourceScan.dataFilters, - extraFilters) + getPushedFilter(fileSourceScan.dataFilters) case batchScan: 
BatchScanExec => batchScan.scan match { case fileScan: FileScan => - fileScan.dataFilters ++ FilterHandler.getRemainingFilters( - fileScan.dataFilters, - extraFilters) + getPushedFilter(fileScan.dataFilters) case _ => // TODO: For data lake format use pushedFilters in SupportsPushDownFilters extraFilters diff --git a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/TransformerApi.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/TransformerApi.scala index e41df0f2f2405..522be378790be 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/TransformerApi.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/TransformerApi.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.connector.read.InputPartition import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, PartitionDirectory} -import org.apache.spark.sql.types.{DataType, DecimalType} +import org.apache.spark.sql.types.{DataType, DecimalType, StructType} import org.apache.spark.util.collection.BitSet import com.google.protobuf.{Any, Message} @@ -34,6 +34,7 @@ trait TransformerApi { /** Generate Seq[InputPartition] for FileSourceScanExecTransformer. */ def genInputPartitionSeq( relation: HadoopFsRelation, + requiredSchema: StructType, selectedPartitions: Array[PartitionDirectory], output: Seq[Attribute], bucketedScan: Boolean, @@ -70,4 +71,7 @@ trait TransformerApi { def getNativePlanString(substraitPlan: Array[Byte], details: Boolean): String def packPBMessage(message: Message): Any + + /** This method is only used for CH backend tests */ + def invalidateSQLExecutionResource(executionId: String): Unit = {} } diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/BasicPhysicalOperatorTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/BasicPhysicalOperatorTransformer.scala index e703295d08511..0b792d52e0561 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/BasicPhysicalOperatorTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/BasicPhysicalOperatorTransformer.scala @@ -129,8 +129,8 @@ abstract class FilterExecTransformerBase(val cond: Expression, val input: SparkP doNativeValidation(substraitContext, relNode) } - override def doTransform(context: SubstraitContext): TransformContext = { - val childCtx = child.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val childCtx = child.asInstanceOf[TransformSupport].transform(context) val remainingCondition = getRemainingCondition val operatorId = context.nextOperatorId(this.nodeName) if (remainingCondition == null) { @@ -190,7 +190,7 @@ case class ProjectExecTransformer private (projectList: Seq[NamedExpression], ch BackendsApiManager.getMetricsApiInstance.genProjectTransformerMetricsUpdater(metrics) override def doTransform(context: SubstraitContext): TransformContext = { - val childCtx = child.asInstanceOf[TransformSupport].doTransform(context) + val childCtx = child.asInstanceOf[TransformSupport].transform(context) val operatorId = context.nextOperatorId(this.nodeName) if ((projectList == null || projectList.isEmpty) && childCtx != null) { // The computing for this project is not needed. @@ -365,7 +365,7 @@ object FilterHandler extends PredicateHelper { * the filter conditions not pushed down into Scan. 
*/ def getRemainingFilters(scanFilters: Seq[Expression], filters: Seq[Expression]): Seq[Expression] = - (ExpressionSet(filters) -- ExpressionSet(scanFilters)).toSeq + (filters.toSet -- scanFilters.toSet).toSeq // Separate and compare the filter conditions in Scan and Filter. // Try to push down the remaining conditions in Filter into Scan. diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/BasicScanExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/BasicScanExecTransformer.scala index b0bc0ea7b27de..64071fb14c0c0 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/BasicScanExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/BasicScanExecTransformer.scala @@ -19,22 +19,20 @@ package org.apache.gluten.execution import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.expression.{ConverterUtils, ExpressionConverter} import org.apache.gluten.extension.ValidationResult +import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.substrait.`type`.ColumnTypeNode import org.apache.gluten.substrait.SubstraitContext import org.apache.gluten.substrait.extensions.ExtensionBuilder -import org.apache.gluten.substrait.plan.PlanBuilder import org.apache.gluten.substrait.rel.{RelBuilder, SplitInfo} import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat -import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.connector.read.InputPartition import org.apache.spark.sql.hive.HiveTableScanExecTransformer import org.apache.spark.sql.types.{BooleanType, StringType, StructField, StructType} -import org.apache.spark.sql.vectorized.ColumnarBatch -import com.google.common.collect.Lists import com.google.protobuf.StringValue +import io.substrait.proto.NamedStruct import scala.collection.JavaConverters._ @@ -75,28 +73,6 @@ trait BasicScanExecTransformer extends LeafTransformSupport with BaseDataSource .genSplitInfo(_, getPartitionSchema, fileFormat, getMetadataColumns.map(_.name))) } - def doExecuteColumnarInternal(): RDD[ColumnarBatch] = { - val numOutputRows = longMetric("numOutputRows") - val numOutputVectors = longMetric("outputVectors") - val scanTime = longMetric("scanTime") - val substraitContext = new SubstraitContext - val transformContext = doTransform(substraitContext) - val outNames = - filteRedundantField(outputAttributes()).map(ConverterUtils.genColumnNameWithExprId).asJava - val planNode = - PlanBuilder.makePlan(substraitContext, Lists.newArrayList(transformContext.root), outNames) - - BackendsApiManager.getIteratorApiInstance.genNativeFileScanRDD( - sparkContext, - WholeStageTransformContext(planNode, substraitContext), - getSplitInfos, - this, - numOutputRows, - numOutputVectors, - scanTime - ) - } - override protected def doValidateInternal(): ValidationResult = { var fields = schema.fields @@ -117,7 +93,7 @@ trait BasicScanExecTransformer extends LeafTransformSupport with BaseDataSource } val substraitContext = new SubstraitContext - val relNode = doTransform(substraitContext).root + val relNode = transform(substraitContext).root doNativeValidation(substraitContext, relNode) } @@ -133,18 +109,20 @@ trait BasicScanExecTransformer extends LeafTransformSupport with BaseDataSource } } - override def doTransform(context: SubstraitContext): TransformContext = { - val output = filteRedundantField(outputAttributes()) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val 
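Pushed-filter handling is now shared between FileSourceScanExec and BatchScanExec: the data filters plus the remaining filters are computed with a plain set difference (object equality rather than ExpressionSet's canonicalized comparison), and any filter that references the row-index metadata column is kept out of the native scan. A simplified sketch with strings standing in for Catalyst expressions and a name predicate standing in for SparkShims.isRowIndexMetadataColumn:

```scala
object PushedFilterSketch {
  // Plain set difference, as in the new getRemainingFilters.
  def remainingFilters(scanFilters: Seq[String], filters: Seq[String]): Seq[String] =
    (filters.toSet -- scanFilters.toSet).toSeq

  def pushedFilters(
      dataFilters: Seq[String],
      extraFilters: Seq[String],
      isRowIndexColumn: String => Boolean): Seq[String] = {
    val all = dataFilters ++ remainingFilters(dataFilters, extraFilters)
    // Filters touching the row-index metadata column stay out of the native scan.
    all.filterNot(isRowIndexColumn)
  }
}
```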
output = outputAttributes() val typeNodes = ConverterUtils.collectAttributeTypeNodes(output) val nameList = ConverterUtils.collectAttributeNamesWithoutExprId(output) val columnTypeNodes = output.map { attr => if (getPartitionSchema.exists(_.name.equals(attr.name))) { - new ColumnTypeNode(1) + new ColumnTypeNode(NamedStruct.ColumnType.PARTITION_COL_VALUE) + } else if (SparkShimLoader.getSparkShims.isRowIndexMetadataColumn(attr.name)) { + new ColumnTypeNode(NamedStruct.ColumnType.ROWINDEX_COL_VALUE) } else if (attr.isMetadataCol) { - new ColumnTypeNode(2) + new ColumnTypeNode(NamedStruct.ColumnType.METADATA_COL_VALUE) } else { - new ColumnTypeNode(0) + new ColumnTypeNode(NamedStruct.ColumnType.NORMAL_COL_VALUE) } }.asJava // Will put all filter expressions into an AND expression @@ -178,21 +156,4 @@ trait BasicScanExecTransformer extends LeafTransformSupport with BaseDataSource context.nextOperatorId(this.nodeName)) TransformContext(output, output, readNode) } - - def filteRedundantField(outputs: Seq[Attribute]): Seq[Attribute] = { - var final_output: List[Attribute] = List() - val outputList = outputs.toArray - for (i <- 0 to outputList.size - 1) { - var dup = false - for (j <- 0 to i - 1) { - if (outputList(i).name == outputList(j).name) { - dup = true - } - } - if (!dup) { - final_output = final_output :+ outputList(i) - } - } - final_output.toSeq - } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/BatchScanExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/BatchScanExecTransformer.scala index b0c8c59e7bb5d..6bff68895a249 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/BatchScanExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/BatchScanExecTransformer.scala @@ -20,9 +20,9 @@ import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.exception.GlutenNotSupportException import org.apache.gluten.extension.ValidationResult import org.apache.gluten.metrics.MetricsUpdater +import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat -import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.QueryPlan @@ -31,7 +31,6 @@ import org.apache.spark.sql.connector.read.{InputPartition, Scan} import org.apache.spark.sql.execution.datasources.v2.{BatchScanExecShim, FileScan} import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.vectorized.ColumnarBatch /** Columnar Based BatchScanExec. 
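Column tagging in BasicScanExecTransformer.doTransform now uses the NamedStruct.ColumnType enum instead of magic integers, and the branches are ordered so that the row-index column (which Spark surfaces as a metadata column, hence the isRowIndexMetadataColumn shim) is matched before the generic metadata branch. A stand-in sketch of that classification order:

```scala
object ColumnTypeSketch {
  // Simplified stand-in for the NamedStruct.ColumnType tagging; the real code builds
  // ColumnTypeNode values from the protobuf enum.
  sealed trait ColumnKind
  case object Normal extends ColumnKind
  case object Partition extends ColumnKind
  case object Metadata extends ColumnKind
  case object RowIndex extends ColumnKind

  def classify(
      name: String,
      partitionColumns: Set[String],
      isRowIndexColumn: String => Boolean,
      isMetadataColumn: String => Boolean): ColumnKind =
    if (partitionColumns.contains(name)) Partition
    else if (isRowIndexColumn(name)) RowIndex // must precede the generic metadata check
    else if (isMetadataColumn(name)) Metadata
    else Normal
}
```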
*/ case class BatchScanExecTransformer( @@ -137,6 +136,13 @@ abstract class BatchScanExecTransformerBase( return ValidationResult.notOk(s"Unsupported aggregation push down for $scan.") } + if ( + SparkShimLoader.getSparkShims.findRowIndexColumnIndexInSchema(schema) > 0 && + !BackendsApiManager.getSettings.supportNativeRowIndexColumn() + ) { + return ValidationResult.notOk("Unsupported row index column scan in native.") + } + if (hasUnsupportedColumns) { return ValidationResult.notOk(s"Unsupported columns scan in native.") } @@ -144,10 +150,6 @@ abstract class BatchScanExecTransformerBase( super.doValidateInternal() } - override def doExecuteColumnar(): RDD[ColumnarBatch] = { - doExecuteColumnarInternal() - } - override def metricsUpdater(): MetricsUpdater = BackendsApiManager.getMetricsApiInstance.genBatchScanTransformerMetricsUpdater(metrics) diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/BroadcastNestedLoopJoinExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/BroadcastNestedLoopJoinExecTransformer.scala index 19a2ec8541e25..092612ea73407 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/BroadcastNestedLoopJoinExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/BroadcastNestedLoopJoinExecTransformer.scala @@ -22,15 +22,13 @@ import org.apache.gluten.metrics.MetricsUpdater import org.apache.gluten.substrait.SubstraitContext import org.apache.gluten.utils.SubstraitUtil -import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide} import org.apache.spark.sql.catalyst.plans.{InnerLike, JoinType, LeftOuter, RightOuter} import org.apache.spark.sql.catalyst.plans.physical.Partitioning -import org.apache.spark.sql.execution.{SparkPlan, SQLExecution} +import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.joins.BaseJoinExec import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.vectorized.ColumnarBatch import io.substrait.proto.CrossRel @@ -66,20 +64,6 @@ abstract class BroadcastNestedLoopJoinExecTransformer( (right, left) } - override def columnarInputRDDs: Seq[RDD[ColumnarBatch]] = { - val streamedRDD = getColumnarInputRDDs(streamedPlan) - val broadcastRDD = { - val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) - BackendsApiManager.getBroadcastApiInstance - .collectExecutionBroadcastTableId(executionId, buildTableId) - createBroadcastBuildSideRDD() - } - // FIXME: Do we have to make build side a RDD? 
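Both scan transformers now consult the new BackendSettingsApi.supportNativeRowIndexColumn() switch and fall back when the schema asks for a row-index column the backend cannot produce natively. A simplified sketch of that validation, with stand-ins for BackendSettingsApi and ValidationResult (the message text mirrors the patch):

```scala
object RowIndexScanValidationSketch {
  // Stand-in for the relevant BackendSettingsApi switch added in this patch.
  trait Settings {
    def supportNativeRowIndexColumn(): Boolean = false
  }

  // Mirrors the guard in FileSourceScanExecTransformerBase / BatchScanExecTransformerBase:
  // reject the native scan when a row-index column is requested but unsupported.
  def validate(settings: Settings, schemaHasRowIndexColumn: Boolean): Either[String, Unit] =
    if (schemaHasRowIndexColumn && !settings.supportNativeRowIndexColumn())
      Left("Unsupported row index column scan in native.")
    else
      Right(())
}
```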
- streamedRDD :+ broadcastRDD - } - - protected def createBroadcastBuildSideRDD(): BroadcastBuildSideRDD - @transient override lazy val metrics: Map[String, SQLMetric] = BackendsApiManager.getMetricsApiInstance.genNestedLoopJoinTransformerMetrics(sparkContext) @@ -119,12 +103,12 @@ abstract class BroadcastNestedLoopJoinExecTransformer( } } - override def doTransform(context: SubstraitContext): TransformContext = { - val streamedPlanContext = streamedPlan.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val streamedPlanContext = streamedPlan.asInstanceOf[TransformSupport].transform(context) val (inputStreamedRelNode, inputStreamedOutput) = (streamedPlanContext.root, streamedPlanContext.outputAttributes) - val buildPlanContext = buildPlan.asInstanceOf[TransformSupport].doTransform(context) + val buildPlanContext = buildPlan.asInstanceOf[TransformSupport].transform(context) val (inputBuildRelNode, inputBuildOutput) = (buildPlanContext.root, buildPlanContext.outputAttributes) diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/CartesianProductExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/CartesianProductExecTransformer.scala index c5a4a0eb81e82..91831f18493ad 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/CartesianProductExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/CartesianProductExecTransformer.scala @@ -77,12 +77,12 @@ case class CartesianProductExecTransformer( BackendsApiManager.getMetricsApiInstance.genNestedLoopJoinTransformerMetricsUpdater(metrics) } - override def doTransform(context: SubstraitContext): TransformContext = { - val leftPlanContext = left.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val leftPlanContext = left.asInstanceOf[TransformSupport].transform(context) val (inputLeftRelNode, inputLeftOutput) = (leftPlanContext.root, leftPlanContext.outputAttributes) - val rightPlanContext = right.asInstanceOf[TransformSupport].doTransform(context) + val rightPlanContext = right.asInstanceOf[TransformSupport].transform(context) val (inputRightRelNode, inputRightOutput) = (rightPlanContext.root, rightPlanContext.outputAttributes) diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/CoalesceExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/ColumnarCoalesceExec.scala similarity index 87% rename from gluten-core/src/main/scala/org/apache/gluten/execution/CoalesceExecTransformer.scala rename to gluten-core/src/main/scala/org/apache/gluten/execution/ColumnarCoalesceExec.scala index 8f30805beeedc..f40a7f8f07f60 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/CoalesceExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/ColumnarCoalesceExec.scala @@ -16,7 +16,7 @@ */ package org.apache.gluten.execution -import org.apache.gluten.extension.{GlutenPlan, ValidationResult} +import org.apache.gluten.extension.GlutenPlan import org.apache.spark.{Partition, SparkContext, TaskContext} import org.apache.spark.rdd.RDD @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartiti import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.vectorized.ColumnarBatch -case class CoalesceExecTransformer(numPartitions: Int, child: SparkPlan) +case class 
ColumnarCoalesceExec(numPartitions: Int, child: SparkPlan) extends UnaryExecNode with GlutenPlan { @@ -38,9 +38,6 @@ case class CoalesceExecTransformer(numPartitions: Int, child: SparkPlan) if (numPartitions == 1) SinglePartition else UnknownPartitioning(numPartitions) } - override protected def doValidateInternal(): ValidationResult = - ValidationResult.ok - override protected def doExecute(): RDD[InternalRow] = { throw new UnsupportedOperationException() } @@ -49,18 +46,18 @@ case class CoalesceExecTransformer(numPartitions: Int, child: SparkPlan) if (numPartitions == 1 && child.executeColumnar().getNumPartitions < 1) { // Make sure we don't output an RDD with 0 partitions, when claiming that we have a // `SinglePartition`. - new CoalesceExecTransformer.EmptyRDDWithPartitions(sparkContext, numPartitions) + new ColumnarCoalesceExec.EmptyRDDWithPartitions(sparkContext, numPartitions) } else { child.executeColumnar().coalesce(numPartitions, shuffle = false) } } - override protected def withNewChildInternal(newChild: SparkPlan): CoalesceExecTransformer = + override protected def withNewChildInternal(newChild: SparkPlan): ColumnarCoalesceExec = copy(child = newChild) } -object CoalesceExecTransformer { +object ColumnarCoalesceExec { class EmptyRDDWithPartitions(@transient private val sc: SparkContext, numPartitions: Int) extends RDD[ColumnarBatch](sc, Nil) { diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/DataSourceScanTransformerRegister.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/DataSourceScanTransformerRegister.scala index b899790c34725..5b46c23857d3d 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/DataSourceScanTransformerRegister.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/DataSourceScanTransformerRegister.scala @@ -16,7 +16,6 @@ */ package org.apache.gluten.execution -import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.v2.BatchScanExec @@ -44,15 +43,12 @@ trait DataSourceScanTransformerRegister { val scanClassName: String def createDataSourceTransformer( - batchScan: FileSourceScanExec, - newPartitionFilters: Seq[Expression]): FileSourceScanExecTransformerBase = { + batchScan: FileSourceScanExec): FileSourceScanExecTransformerBase = { throw new UnsupportedOperationException( "This should not be called, please implement this method in child class."); } - def createDataSourceV2Transformer( - batchScan: BatchScanExec, - newPartitionFilters: Seq[Expression]): BatchScanExecTransformerBase = { + def createDataSourceV2Transformer(batchScan: BatchScanExec): BatchScanExecTransformerBase = { throw new UnsupportedOperationException( "This should not be called, please implement this method in child class."); } diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/ExpandExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/ExpandExecTransformer.scala index aa98d88b29a85..362debb531ee6 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/ExpandExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/ExpandExecTransformer.scala @@ -110,8 +110,8 @@ case class ExpandExecTransformer( doNativeValidation(substraitContext, relNode) } - override def doTransform(context: SubstraitContext): TransformContext = { - val childCtx = child.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: 
SubstraitContext): TransformContext = { + val childCtx = child.asInstanceOf[TransformSupport].transform(context) val operatorId = context.nextOperatorId(this.nodeName) if (projections == null || projections.isEmpty) { // The computing for this Expand is not needed. diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/FileSourceScanExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/FileSourceScanExecTransformer.scala index c3d2da7f0466c..4f120488c2fb5 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/FileSourceScanExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/FileSourceScanExecTransformer.scala @@ -19,9 +19,9 @@ package org.apache.gluten.execution import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.extension.ValidationResult import org.apache.gluten.metrics.MetricsUpdater +import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat -import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, PlanExpression} import org.apache.spark.sql.catalyst.plans.QueryPlan @@ -30,7 +30,6 @@ import org.apache.spark.sql.execution.FileSourceScanExecShim import org.apache.spark.sql.execution.datasources.HadoopFsRelation import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.collection.BitSet case class FileSourceScanExecTransformer( @@ -103,20 +102,20 @@ abstract class FileSourceScanExecTransformerBase( override def getMetadataColumns(): Seq[AttributeReference] = metadataColumns - def getPartitionFilters(): Seq[Expression] = partitionFilters - override def outputAttributes(): Seq[Attribute] = output override def getPartitions: Seq[InputPartition] = { BackendsApiManager.getTransformerApiInstance.genInputPartitionSeq( relation, + requiredSchema, dynamicallySelectedPartitions, output, bucketedScan, optionalBucketSet, optionalNumCoalescedBuckets, disableBucketedScan, - filterExprs()) + filterExprs() + ) } override def getPartitionSchema: StructType = relation.partitionSchema @@ -134,6 +133,13 @@ abstract class FileSourceScanExecTransformerBase( return ValidationResult.notOk(s"Unsupported metadata columns scan in native.") } + if ( + SparkShimLoader.getSparkShims.findRowIndexColumnIndexInSchema(schema) > 0 && + !BackendsApiManager.getSettings.supportNativeRowIndexColumn() + ) { + return ValidationResult.notOk("Unsupported row index column scan in native.") + } + if (hasUnsupportedColumns) { return ValidationResult.notOk(s"Unsupported columns scan in native.") } @@ -147,10 +153,6 @@ abstract class FileSourceScanExecTransformerBase( super.doValidateInternal() } - override protected def doExecuteColumnar(): RDD[ColumnarBatch] = { - doExecuteColumnarInternal() - } - override def metricsUpdater(): MetricsUpdater = BackendsApiManager.getMetricsApiInstance.genFileSourceScanTransformerMetricsUpdater(metrics) @@ -183,6 +185,7 @@ abstract class FileSourceScanExecTransformerBase( relation.fileFormat.getClass.getSimpleName match { case "OrcFileFormat" => ReadFileFormat.OrcReadFormat case "ParquetFileFormat" => ReadFileFormat.ParquetReadFormat + case "DeltaParquetFileFormat" => ReadFileFormat.ParquetReadFormat case "DwrfFileFormat" => ReadFileFormat.DwrfReadFormat case "DeltaMergeTreeFileFormat" => 
ReadFileFormat.MergeTreeReadFormat case "CSVFileFormat" => ReadFileFormat.TextReadFormat diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/GenerateExecTransformerBase.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/GenerateExecTransformerBase.scala index 5811f7b47fe4b..b5c9b85aeb0d5 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/GenerateExecTransformerBase.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/GenerateExecTransformerBase.scala @@ -76,8 +76,8 @@ abstract class GenerateExecTransformerBase( doNativeValidation(context, relNode) } - override def doTransform(context: SubstraitContext): TransformContext = { - val childCtx = child.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val childCtx = child.asInstanceOf[TransformSupport].transform(context) val relNode = getRelNode(context, childCtx.root, getGeneratorNode(context), validation = false) TransformContext(child.output, output, relNode) } diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/HashAggregateExecBaseTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/HashAggregateExecBaseTransformer.scala index 49a9ee1e816a3..9345b3a3636fc 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/HashAggregateExecBaseTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/HashAggregateExecBaseTransformer.scala @@ -84,9 +84,14 @@ abstract class HashAggregateExecBaseTransformer( val functionString = truncatedString(allAggregateExpressions, "[", ", ", "]", maxFields) val outputString = truncatedString(output, "[", ", ", "]", maxFields) if (verbose) { - s"HashAggregateTransformer(keys=$keyString, functions=$functionString, output=$outputString)" + s"HashAggregateTransformer(keys=$keyString, " + + s"functions=$functionString, " + + s"isStreamingAgg=$isCapableForStreamingAggregation, " + + s"output=$outputString)" } else { - s"HashAggregateTransformer(keys=$keyString, functions=$functionString)" + s"HashAggregateTransformer(keys=$keyString, " + + s"functions=$functionString, " + + s"isStreamingAgg=$isCapableForStreamingAggregation)" } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/JoinExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/JoinExecTransformer.scala index e47ad8c7b1bd9..cd22c578594c6 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/JoinExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/JoinExecTransformer.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.execution.{ExpandOutputPartitioningShim, SparkPlan, SQLExecution} +import org.apache.spark.sql.execution.{ExpandOutputPartitioningShim, ExplainUtils, SparkPlan} import org.apache.spark.sql.execution.joins.{BaseJoinExec, HashedRelationBroadcastMode, HashJoin} import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.types._ @@ -100,6 +100,11 @@ trait HashJoinLikeExecTransformer extends BaseJoinExec with TransformSupport { def joinBuildSide: BuildSide def hashJoinType: JoinType + override def simpleStringWithNodeId(): String = { + val opId = ExplainUtils.getOpId(this) + s"$nodeName $joinType 
$joinBuildSide ($opId)".trim + } + // Note: "metrics" is made transient to avoid sending driver-side metrics to tasks. @transient override lazy val metrics: Map[String, SQLMetric] = BackendsApiManager.getMetricsApiInstance.genHashJoinTransformerMetrics(sparkContext) @@ -227,12 +232,12 @@ trait HashJoinLikeExecTransformer extends BaseJoinExec with TransformSupport { doNativeValidation(substraitContext, relNode) } - override def doTransform(context: SubstraitContext): TransformContext = { - val streamedPlanContext = streamedPlan.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val streamedPlanContext = streamedPlan.asInstanceOf[TransformSupport].transform(context) val (inputStreamedRelNode, inputStreamedOutput) = (streamedPlanContext.root, streamedPlanContext.outputAttributes) - val buildPlanContext = buildPlan.asInstanceOf[TransformSupport].doTransform(context) + val buildPlanContext = buildPlan.asInstanceOf[TransformSupport].transform(context) val (inputBuildRelNode, inputBuildOutput) = (buildPlanContext.root, buildPlanContext.outputAttributes) @@ -414,18 +419,4 @@ abstract class BroadcastHashJoinExecTransformerBase( override def genJoinParametersInternal(): (Int, Int, String) = { (1, if (isNullAwareAntiJoin) 1 else 0, buildHashTableId) } - - override def columnarInputRDDs: Seq[RDD[ColumnarBatch]] = { - val streamedRDD = getColumnarInputRDDs(streamedPlan) - val broadcastRDD = { - val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) - BackendsApiManager.getBroadcastApiInstance - .collectExecutionBroadcastTableId(executionId, buildHashTableId) - createBroadcastBuildSideRDD() - } - // FIXME: Do we have to make build side a RDD? - streamedRDD :+ broadcastRDD - } - - protected def createBroadcastBuildSideRDD(): BroadcastBuildSideRDD } diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/LimitTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/LimitTransformer.scala index 3379542ad1d63..8859844be48e9 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/LimitTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/LimitTransformer.scala @@ -53,8 +53,8 @@ case class LimitTransformer(child: SparkPlan, offset: Long, count: Long) doNativeValidation(context, relNode) } - override def doTransform(context: SubstraitContext): TransformContext = { - val childCtx = child.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val childCtx = child.asInstanceOf[TransformSupport].transform(context) val operatorId = context.nextOperatorId(this.nodeName) val relNode = getRelNode(context, operatorId, offset, count, child.output, childCtx.root, false) TransformContext(child.output, child.output, relNode) diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/SampleExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/SampleExecTransformer.scala new file mode 100644 index 0000000000000..6f9ef34282bf0 --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/SampleExecTransformer.scala @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.execution + +import org.apache.gluten.backendsapi.BackendsApiManager +import org.apache.gluten.expression.{ConverterUtils, ExpressionConverter} +import org.apache.gluten.extension.ValidationResult +import org.apache.gluten.metrics.MetricsUpdater +import org.apache.gluten.substrait.`type`.TypeBuilder +import org.apache.gluten.substrait.SubstraitContext +import org.apache.gluten.substrait.extensions.ExtensionBuilder +import org.apache.gluten.substrait.rel.{RelBuilder, RelNode} + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, LessThan, Literal, Rand} +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.types.DoubleType + +import scala.collection.JavaConverters._ + +/** + * SampleExec supports two sampling methods: with replacement and without replacement. This + * transformer currently supports only sampling without replacement. For sampling without + * replacement, SampleExec uses `seed + partitionId` as the seed for each partition. The `upperBound + * \- lowerBound` value is used as the fraction, and the XORShiftRandom number generator is + * employed. Each row undergoes a Bernoulli trial, and if the generated random number falls within + * the range [lowerBound, upperBound), the row is included; otherwise, it is skipped. + * + * This transformer converts SampleExec to a Substrait Filter relation, achieving a similar sampling + * effect through a filter with a rand-based sampling expression. Specifically, the `upperBound - + * lowerBound` value is used as the fraction, and the node is translated to `filter(rand(seed + + * partitionId) < fraction)` for random sampling. + */ +case class SampleExecTransformer( + lowerBound: Double, + upperBound: Double, + withReplacement: Boolean, + seed: Long, + child: SparkPlan) + extends UnaryTransformSupport + with Logging { + def fraction: Double = upperBound - lowerBound + + def condition: Expression = { + val randExpr: Expression = Rand(seed) + val sampleRateExpr: Expression = Literal(fraction, DoubleType) + LessThan(randExpr, sampleRateExpr) + } + + override def output: Seq[Attribute] = child.output + + // Note: "metrics" is made transient to avoid sending driver-side metrics to tasks.
+ @transient override lazy val metrics = + BackendsApiManager.getMetricsApiInstance.genSampleTransformerMetrics(sparkContext) + + override def metricsUpdater(): MetricsUpdater = + BackendsApiManager.getMetricsApiInstance.genSampleTransformerMetricsUpdater(metrics) + + def getRelNode( + context: SubstraitContext, + condExpr: Expression, + originalInputAttributes: Seq[Attribute], + operatorId: Long, + input: RelNode, + validation: Boolean): RelNode = { + assert(condExpr != null) + val args = context.registeredFunction + val condExprNode = ExpressionConverter + .replaceWithExpressionTransformer(condExpr, attributeSeq = originalInputAttributes) + .doTransform(args) + + if (!validation) { + RelBuilder.makeFilterRel(input, condExprNode, context, operatorId) + } else { + // Use an extension node to send the input types through the Substrait plan for validation. + val inputTypeNodeList = originalInputAttributes + .map(attr => ConverterUtils.getTypeNode(attr.dataType, attr.nullable)) + .asJava + val extensionNode = ExtensionBuilder.makeAdvancedExtension( + BackendsApiManager.getTransformerApiInstance.packPBMessage( + TypeBuilder.makeStruct(false, inputTypeNodeList).toProtobuf)) + RelBuilder.makeFilterRel(input, condExprNode, extensionNode, context, operatorId) + } + } + + override protected def doValidateInternal(): ValidationResult = { + if (withReplacement) { + return ValidationResult.notOk( + "Unsupported sample exec in native when the " + + s"withReplacement parameter is $withReplacement") + } + val substraitContext = new SubstraitContext + val operatorId = substraitContext.nextOperatorId((this.nodeName)) + // Firstly, need to check if the Substrait plan for this operator can be successfully generated. + val relNode = + getRelNode(substraitContext, condition, child.output, operatorId, null, validation = true) + // Then, validate the generated plan in native engine. 
+ doNativeValidation(substraitContext, relNode) + } + + override protected def doTransform(context: SubstraitContext): TransformContext = { + val childCtx = child.asInstanceOf[TransformSupport].transform(context) + val operatorId = context.nextOperatorId(this.nodeName) + val currRel = + getRelNode(context, condition, child.output, operatorId, childCtx.root, validation = false) + assert(currRel != null, "Filter rel should be valid.") + TransformContext(childCtx.outputAttributes, output, currRel) + } + + override protected def withNewChildInternal(newChild: SparkPlan): SampleExecTransformer = + copy(child = newChild) +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/ScanTransformerFactory.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/ScanTransformerFactory.scala index fc2b8f5064923..44a823834f926 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/ScanTransformerFactory.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/ScanTransformerFactory.scala @@ -17,8 +17,7 @@ package org.apache.gluten.execution import org.apache.gluten.exception.GlutenNotSupportException -import org.apache.gluten.expression.ExpressionConverter -import org.apache.gluten.extension.columnar.TransformHints +import org.apache.gluten.extension.columnar.FallbackTags import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.spark.sql.catalyst.expressions.Expression @@ -37,14 +36,7 @@ object ScanTransformerFactory { def createFileSourceScanTransformer( scanExec: FileSourceScanExec, - allPushDownFilters: Option[Seq[Expression]] = None, - validation: Boolean = false): FileSourceScanExecTransformerBase = { - // transform BroadcastExchangeExec to ColumnarBroadcastExchangeExec in partitionFilters - val newPartitionFilters = if (validation) { - scanExec.partitionFilters - } else { - ExpressionConverter.transformDynamicPruningExpr(scanExec.partitionFilters) - } + allPushDownFilters: Option[Seq[Expression]] = None): FileSourceScanExecTransformerBase = { val fileFormat = scanExec.relation.fileFormat lookupDataSourceScanTransformer(fileFormat.getClass.getName) match { case Some(clz) => @@ -52,13 +44,13 @@ object ScanTransformerFactory { .getDeclaredConstructor() .newInstance() .asInstanceOf[DataSourceScanTransformerRegister] - .createDataSourceTransformer(scanExec, newPartitionFilters) + .createDataSourceTransformer(scanExec) case _ => - new FileSourceScanExecTransformer( + FileSourceScanExecTransformer( scanExec.relation, scanExec.output, scanExec.requiredSchema, - newPartitionFilters, + scanExec.partitionFilters, scanExec.optionalBucketSet, scanExec.optionalNumCoalescedBuckets, allPushDownFilters.getOrElse(scanExec.dataFilters), @@ -69,8 +61,7 @@ object ScanTransformerFactory { } private def lookupBatchScanTransformer( - batchScanExec: BatchScanExec, - newPartitionFilters: Seq[Expression]): BatchScanExecTransformerBase = { + batchScanExec: BatchScanExec): BatchScanExecTransformerBase = { val scan = batchScanExec.scan lookupDataSourceScanTransformer(scan.getClass.getName) match { case Some(clz) => @@ -78,14 +69,14 @@ object ScanTransformerFactory { .getDeclaredConstructor() .newInstance() .asInstanceOf[DataSourceScanTransformerRegister] - .createDataSourceV2Transformer(batchScanExec, newPartitionFilters) + .createDataSourceV2Transformer(batchScanExec) case _ => scan match { case _: FileScan => - new BatchScanExecTransformer( + BatchScanExecTransformer( batchScanExec.output, batchScanExec.scan, - newPartitionFilters, + batchScanExec.runtimeFilters, table = 
SparkShimLoader.getSparkShims.getBatchScanExecTable(batchScanExec) ) case _ => @@ -99,14 +90,7 @@ object ScanTransformerFactory { allPushDownFilters: Option[Seq[Expression]] = None, validation: Boolean = false): SparkPlan = { if (supportedBatchScan(batchScan.scan)) { - val newPartitionFilters = if (validation) { - // No transformation is needed for DynamicPruningExpressions - // during the validation process. - batchScan.runtimeFilters - } else { - ExpressionConverter.transformDynamicPruningExpr(batchScan.runtimeFilters) - } - val transformer = lookupBatchScanTransformer(batchScan, newPartitionFilters) + val transformer = lookupBatchScanTransformer(batchScan) if (!validation && allPushDownFilters.isDefined) { transformer.setPushDownFilters(allPushDownFilters.get) // Validate again if allPushDownFilters is defined. @@ -115,7 +99,7 @@ object ScanTransformerFactory { transformer } else { val newSource = batchScan.copy(runtimeFilters = transformer.runtimeFilters) - TransformHints.tagNotTransformable(newSource, validationResult.reason.get) + FallbackTags.add(newSource, validationResult.reason.get) newSource } } else { @@ -125,12 +109,8 @@ object ScanTransformerFactory { if (validation) { throw new GlutenNotSupportException(s"Unsupported scan ${batchScan.scan}") } - // If filter expressions aren't empty, we need to transform the inner operators, - // and fallback the BatchScanExec itself. - val newSource = batchScan.copy(runtimeFilters = ExpressionConverter - .transformDynamicPruningExpr(batchScan.runtimeFilters)) - TransformHints.tagNotTransformable(newSource, "The scan in BatchScanExec is not supported.") - newSource + FallbackTags.add(batchScan, "The scan in BatchScanExec is not supported.") + batchScan } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/SortExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/SortExecTransformer.scala index 9e4a20d40ad42..f79dc69e680b5 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/SortExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/SortExecTransformer.scala @@ -101,8 +101,8 @@ case class SortExecTransformer( doNativeValidation(substraitContext, relNode) } - override def doTransform(context: SubstraitContext): TransformContext = { - val childCtx = child.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val childCtx = child.asInstanceOf[TransformSupport].transform(context) val operatorId = context.nextOperatorId(this.nodeName) if (sortOrder == null || sortOrder.isEmpty) { // The computing for this project is not needed. 
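The scaladoc on the new SampleExecTransformer above describes the rewrite it performs: a without-replacement SampleExec is expressed as a plain filter over a uniform random draw. The standalone snippet below is illustrative only and is not part of the patch; it restates that equivalence with the public DataFrame API. The session setup, column name and row count are assumptions made for the example, and since vanilla Spark seeds its Bernoulli sampler per partition, the two counts come out close to each other rather than identical.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.rand

object SampleAsFilterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("sample-as-filter").getOrCreate()
    import spark.implicits._

    val df = (1 to 10000).toDF("id")
    val seed = 42L
    // fraction plays the role of upperBound - lowerBound in SampleExecTransformer.
    val fraction = 0.1

    // What the transformer emits: a plain filter whose condition is rand(seed) < fraction.
    val viaFilter = df.filter(rand(seed) < fraction)

    // What vanilla Spark runs: Bernoulli sampling with a per-partition seed.
    val viaSample = df.sample(withReplacement = false, fraction = fraction, seed = seed)

    println(s"filter-based count: ${viaFilter.count()}, sample-based count: ${viaSample.count()}")
    spark.stop()
  }
}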
diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/SortMergeJoinExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/SortMergeJoinExecTransformer.scala index 5ca11a53c88a9..f032c4ca00879 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/SortMergeJoinExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/SortMergeJoinExecTransformer.scala @@ -55,11 +55,6 @@ abstract class SortMergeJoinExecTransformerBase( val (bufferedKeys, streamedKeys, bufferedPlan, streamedPlan) = (rightKeys, leftKeys, right, left) - override def simpleStringWithNodeId(): String = { - val opId = ExplainUtils.getOpId(this) - s"$nodeName $joinType ($opId)".trim - } - override def verboseStringWithOperatorId(): String = { val joinCondStr = if (condition.isDefined) { s"${condition.get}" @@ -191,12 +186,12 @@ abstract class SortMergeJoinExecTransformerBase( doNativeValidation(substraitContext, relNode) } - override def doTransform(context: SubstraitContext): TransformContext = { - val streamedPlanContext = streamedPlan.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val streamedPlanContext = streamedPlan.asInstanceOf[TransformSupport].transform(context) val (inputStreamedRelNode, inputStreamedOutput) = (streamedPlanContext.root, streamedPlanContext.outputAttributes) - val bufferedPlanContext = bufferedPlan.asInstanceOf[TransformSupport].doTransform(context) + val bufferedPlanContext = bufferedPlan.asInstanceOf[TransformSupport].transform(context) val (inputBuildRelNode, inputBuildOutput) = (bufferedPlanContext.root, bufferedPlanContext.outputAttributes) @@ -255,7 +250,6 @@ case class SortMergeJoinExecTransformer( projectList) { override protected def doValidateInternal(): ValidationResult = { - val substraitContext = new SubstraitContext // Firstly, need to check if the Substrait plan for this operator can be successfully generated. if (substraitJoinType == JoinRel.JoinType.JOIN_TYPE_OUTER) { return ValidationResult diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/SortUtils.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/SortUtils.scala new file mode 100644 index 0000000000000..2c0ad1b0a59a1 --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/SortUtils.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.execution + +import org.apache.gluten.extension.columnar.rewrite.RewrittenNodeWall + +import org.apache.spark.sql.execution.{ProjectExec, SortExec, SparkPlan} + +object SortUtils { + def dropPartialSort(plan: SparkPlan): SparkPlan = plan match { + case RewrittenNodeWall(p) => RewrittenNodeWall(dropPartialSort(p)) + case sort: SortExec if !sort.global => sort.child + // from pre/post project-pulling + case ProjectExec(_, SortExec(_, false, ProjectExec(_, p), _)) + if plan.outputSet == p.outputSet => + p + case _ => plan + } +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/WholeStageTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/WholeStageTransformer.scala index b809ac4bf1a4c..78132c08c7823 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/WholeStageTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/WholeStageTransformer.scala @@ -21,7 +21,7 @@ import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.exception.GlutenException import org.apache.gluten.expression._ import org.apache.gluten.extension.GlutenPlan -import org.apache.gluten.metrics.{GlutenTimeMetric, MetricsUpdater, NoopMetricsUpdater} +import org.apache.gluten.metrics.{GlutenTimeMetric, MetricsUpdater} import org.apache.gluten.substrait.`type`.{TypeBuilder, TypeNode} import org.apache.gluten.substrait.SubstraitContext import org.apache.gluten.substrait.plan.{PlanBuilder, PlanNode} @@ -68,7 +68,22 @@ trait TransformSupport extends GlutenPlan { */ def columnarInputRDDs: Seq[RDD[ColumnarBatch]] - def doTransform(context: SubstraitContext): TransformContext = { + final def transform(context: SubstraitContext): TransformContext = { + if (isCanonicalizedPlan) { + throw new IllegalStateException( + "A canonicalized plan is not supposed to be executed transform.") + } + if (TransformerState.underValidationState) { + doTransform(context) + } else { + // Materialize subquery first before going to do transform. + executeQuery { + doTransform(context) + } + } + } + + protected def doTransform(context: SubstraitContext): TransformContext = { throw new UnsupportedOperationException( s"This operator doesn't support doTransform with SubstraitContext.") } @@ -97,7 +112,7 @@ trait UnaryTransformSupport extends TransformSupport with UnaryExecNode { case class WholeStageTransformer(child: SparkPlan, materializeInput: Boolean = false)( val transformStageId: Int -) extends GenerateTreeStringShim +) extends WholeStageTransformerGenerateTreeStringShim with UnaryTransformSupport { assert(child.isInstanceOf[TransformSupport]) @@ -182,7 +197,7 @@ case class WholeStageTransformer(child: SparkPlan, materializeInput: Boolean = f val substraitContext = new SubstraitContext val childCtx = child .asInstanceOf[TransformSupport] - .doTransform(substraitContext) + .transform(substraitContext) if (childCtx == null) { throw new NullPointerException(s"WholeStageTransformer can't do Transform on $child") } @@ -216,8 +231,6 @@ case class WholeStageTransformer(child: SparkPlan, materializeInput: Boolean = f } def doWholeStageTransform(): WholeStageTransformContext = { - // invoke SparkPlan.prepare to do subquery preparation etc. 
- super.prepare() val context = generateWholeStageTransformContext() if (conf.getConf(GlutenConfig.CACHE_WHOLE_STAGE_TRANSFORMER_CONTEXT)) { wholeStageTransformerContext = Some(context) @@ -252,11 +265,17 @@ case class WholeStageTransformer(child: SparkPlan, materializeInput: Boolean = f } transformChildren(child, basicScanExecTransformers) - basicScanExecTransformers + basicScanExecTransformers.toSeq } override def doExecuteColumnar(): RDD[ColumnarBatch] = { val pipelineTime: SQLMetric = longMetric("pipelineTime") + // We should do transform first to make sure all subqueries are materialized + val wsCtx = GlutenTimeMetric.withMillisTime { + doWholeStageTransform() + }( + t => + logOnLevel(substraitPlanLogLevel, s"$nodeName generating the substrait plan took: $t ms.")) val inputRDDs = new ColumnarInputRDDsWrapper(columnarInputRDDs) // Check if BatchScan exists. val basicScanExecTransformers = findAllScanTransformers() @@ -271,22 +290,11 @@ case class WholeStageTransformer(child: SparkPlan, materializeInput: Boolean = f val allScanPartitions = basicScanExecTransformers.map(_.getPartitions) val allScanSplitInfos = getSplitInfosFromPartitions(basicScanExecTransformers, allScanPartitions) - - val (wsCtx, inputPartitions) = GlutenTimeMetric.withMillisTime { - val wsCtx = doWholeStageTransform() - val partitions = - BackendsApiManager.getIteratorApiInstance.genPartitions( - wsCtx, - allScanSplitInfos, - basicScanExecTransformers) - - (wsCtx, partitions) - }( - t => - logOnLevel( - substraitPlanLogLevel, - s"$nodeName generating the substrait plan took: $t ms.")) - + val inputPartitions = + BackendsApiManager.getIteratorApiInstance.genPartitions( + wsCtx, + allScanSplitInfos, + basicScanExecTransformers) val rdd = new GlutenWholeStageColumnarRDD( sparkContext, inputPartitions, @@ -321,22 +329,18 @@ case class WholeStageTransformer(child: SparkPlan, materializeInput: Boolean = f * GlutenDataFrameAggregateSuite) in these cases, separate RDDs takes care of SCAN as a * result, genFinalStageIterator rather than genFirstStageIterator will be invoked */ - val resCtx = GlutenTimeMetric.withMillisTime(doWholeStageTransform()) { - t => - logOnLevel(substraitPlanLogLevel, s"$nodeName generating the substrait plan took: $t ms.") - } new WholeStageZippedPartitionsRDD( sparkContext, inputRDDs, numaBindingInfo, sparkConf, - resCtx, + wsCtx, pipelineTime, BackendsApiManager.getMetricsApiInstance.metricsUpdatingFunction( child, - resCtx.substraitContext.registeredRelMap, - resCtx.substraitContext.registeredJoinParams, - resCtx.substraitContext.registeredAggregationParams + wsCtx.substraitContext.registeredRelMap, + wsCtx.substraitContext.registeredJoinParams, + wsCtx.substraitContext.registeredAggregationParams ), materializeInput ) @@ -346,7 +350,7 @@ case class WholeStageTransformer(child: SparkPlan, materializeInput: Boolean = f override def metricsUpdater(): MetricsUpdater = { child match { case transformer: TransformSupport => transformer.metricsUpdater() - case _ => NoopMetricsUpdater + case _ => MetricsUpdater.None } } @@ -357,7 +361,7 @@ case class WholeStageTransformer(child: SparkPlan, materializeInput: Boolean = f case _ => false } .map(_.asInstanceOf[TransformSupport].metricsUpdater()) - .getOrElse(NoopMetricsUpdater) + .getOrElse(MetricsUpdater.None) } override protected def withNewChildInternal(newChild: SparkPlan): WholeStageTransformer = diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/WindowExecTransformer.scala 
b/gluten-core/src/main/scala/org/apache/gluten/execution/WindowExecTransformer.scala index d7c3d3dd56a4a..6832221a404d9 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/WindowExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/WindowExecTransformer.scala @@ -179,8 +179,8 @@ case class WindowExecTransformer( doNativeValidation(substraitContext, relNode) } - override def doTransform(context: SubstraitContext): TransformContext = { - val childCtx = child.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val childCtx = child.asInstanceOf[TransformSupport].transform(context) val operatorId = context.nextOperatorId(this.nodeName) if (windowExpression == null || windowExpression.isEmpty) { // The computing for this operator is not needed. @@ -197,16 +197,3 @@ case class WindowExecTransformer( override protected def withNewChildInternal(newChild: SparkPlan): WindowExecTransformer = copy(child = newChild) } - -object WindowExecTransformer { - - /** Gets lower/upper bound represented in string. */ - def getFrameBound(bound: Expression): String = { - // The lower/upper can be either a foldable Expression or a SpecialFrameBoundary. - if (bound.foldable) { - bound.eval().toString - } else { - bound.sql - } - } -} diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/WindowGroupLimitExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/WindowGroupLimitExecTransformer.scala index bba79fa761e82..46a4e1aa4eeec 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/WindowGroupLimitExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/WindowGroupLimitExecTransformer.scala @@ -146,8 +146,8 @@ case class WindowGroupLimitExecTransformer( doNativeValidation(substraitContext, relNode) } - override def doTransform(context: SubstraitContext): TransformContext = { - val childCtx = child.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val childCtx = child.asInstanceOf[TransformSupport].transform(context) val operatorId = context.nextOperatorId(this.nodeName) val currRel = diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/WriteFilesExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/WriteFilesExecTransformer.scala index 7df4afa8a6c1f..14d58bfa83771 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/WriteFilesExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/WriteFilesExecTransformer.scala @@ -161,8 +161,8 @@ case class WriteFilesExecTransformer( doNativeValidation(substraitContext, relNode) } - override def doTransform(context: SubstraitContext): TransformContext = { - val childCtx = child.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val childCtx = child.asInstanceOf[TransformSupport].transform(context) val operatorId = context.nextOperatorId(this.nodeName) val currRel = getRelNode(context, getFinalChildOutput(), operatorId, childCtx.root, validation = false) diff --git a/gluten-core/src/main/scala/org/apache/gluten/expression/ConverterUtils.scala b/gluten-core/src/main/scala/org/apache/gluten/expression/ConverterUtils.scala index a944de3d3cefe..473ee7f9d62f6 100644 --- 
a/gluten-core/src/main/scala/org/apache/gluten/expression/ConverterUtils.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/expression/ConverterUtils.scala @@ -73,7 +73,7 @@ object ConverterUtils extends Logging { } def collectAttributeTypeNodes(attributes: JList[Attribute]): JList[TypeNode] = { - collectAttributeTypeNodes(attributes.asScala) + collectAttributeTypeNodes(attributes.asScala.toSeq) } def collectAttributeTypeNodes(attributes: Seq[Attribute]): JList[TypeNode] = { @@ -85,7 +85,7 @@ object ConverterUtils extends Logging { } def collectAttributeNamesWithExprId(attributes: JList[Attribute]): JList[String] = { - collectAttributeNamesWithExprId(attributes.asScala) + collectAttributeNamesWithExprId(attributes.asScala.toSeq) } def collectAttributeNamesWithExprId(attributes: Seq[Attribute]): JList[String] = { @@ -197,7 +197,7 @@ object ConverterUtils extends Logging { val (field, nullable) = parseFromSubstraitType(typ) StructField("", field, nullable) } - (StructType(fields), isNullable(substraitType.getStruct.getNullability)) + (StructType(fields.toSeq), isNullable(substraitType.getStruct.getNullability)) case Type.KindCase.LIST => val list = substraitType.getList val (elementType, containsNull) = parseFromSubstraitType(list.getType) diff --git a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala index 2d514118ac300..b5bcb6876e4d9 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala @@ -16,10 +16,8 @@ */ package org.apache.gluten.expression -import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.exception.GlutenNotSupportException -import org.apache.gluten.extension.columnar.transition.Transitions import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.test.TestStats import org.apache.gluten.utils.DecimalArithmeticUtil @@ -29,14 +27,12 @@ import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke import org.apache.spark.sql.catalyst.optimizer.NormalizeNaNAndZero -import org.apache.spark.sql.execution.{ScalarSubquery, _} -import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec -import org.apache.spark.sql.execution.exchange.BroadcastExchangeExec +import org.apache.spark.sql.execution.ScalarSubquery import org.apache.spark.sql.hive.HiveUDFTransformer import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -trait Transformable extends Unevaluable { +trait Transformable { def getTransformer(childrenTransformers: Seq[ExpressionTransformer]): ExpressionTransformer } @@ -400,14 +396,12 @@ object ExpressionConverter extends SQLConfHelper with Logging { r ) case size: Size => - if (size.legacySizeOfNull != SQLConf.get.legacySizeOfNull) { - throw new GlutenNotSupportException( - "The value of legacySizeOfNull field of size is " + - "not equals to legacySizeOfNull of SQLConf, this case is not supported yet") - } - BackendsApiManager.getSparkPlanExecApiInstance.genSizeExpressionTransformer( + // Covers Spark ArraySize which is replaced by Size(child, false). 
+ val child = + replaceWithExpressionTransformerInternal(size.child, attributeSeq, expressionsMap) + GenericExpressionTransformer( substraitExprName, - replaceWithExpressionTransformerInternal(size.child, attributeSeq, expressionsMap), + Seq(child, LiteralTransformer(size.legacySizeOfNull)), size) case namedStruct: CreateNamedStruct => BackendsApiManager.getSparkPlanExecApiInstance.genNamedStructTransformer( @@ -563,18 +557,68 @@ object ExpressionConverter extends SQLConfHelper with Logging { arrayTransform ) case tryEval @ TryEval(a: Add) => - BackendsApiManager.getSparkPlanExecApiInstance.genTryAddTransformer( + BackendsApiManager.getSparkPlanExecApiInstance.genTryArithmeticTransformer( substraitExprName, replaceWithExpressionTransformerInternal(a.left, attributeSeq, expressionsMap), replaceWithExpressionTransformerInternal(a.right, attributeSeq, expressionsMap), - tryEval + tryEval, + ExpressionNames.CHECKED_ADD + ) + case tryEval @ TryEval(a: Subtract) => + BackendsApiManager.getSparkPlanExecApiInstance.genTryArithmeticTransformer( + substraitExprName, + replaceWithExpressionTransformerInternal(a.left, attributeSeq, expressionsMap), + replaceWithExpressionTransformerInternal(a.right, attributeSeq, expressionsMap), + tryEval, + ExpressionNames.CHECKED_SUBTRACT + ) + case tryEval @ TryEval(a: Divide) => + BackendsApiManager.getSparkPlanExecApiInstance.genTryArithmeticTransformer( + substraitExprName, + replaceWithExpressionTransformerInternal(a.left, attributeSeq, expressionsMap), + replaceWithExpressionTransformerInternal(a.right, attributeSeq, expressionsMap), + tryEval, + ExpressionNames.CHECKED_DIVIDE + ) + case tryEval @ TryEval(a: Multiply) => + BackendsApiManager.getSparkPlanExecApiInstance.genTryArithmeticTransformer( + substraitExprName, + replaceWithExpressionTransformerInternal(a.left, attributeSeq, expressionsMap), + replaceWithExpressionTransformerInternal(a.right, attributeSeq, expressionsMap), + tryEval, + ExpressionNames.CHECKED_MULTIPLY ) case a: Add => - BackendsApiManager.getSparkPlanExecApiInstance.genAddTransformer( + BackendsApiManager.getSparkPlanExecApiInstance.genArithmeticTransformer( substraitExprName, replaceWithExpressionTransformerInternal(a.left, attributeSeq, expressionsMap), replaceWithExpressionTransformerInternal(a.right, attributeSeq, expressionsMap), - a + a, + ExpressionNames.CHECKED_ADD + ) + case a: Subtract => + BackendsApiManager.getSparkPlanExecApiInstance.genArithmeticTransformer( + substraitExprName, + replaceWithExpressionTransformerInternal(a.left, attributeSeq, expressionsMap), + replaceWithExpressionTransformerInternal(a.right, attributeSeq, expressionsMap), + a, + ExpressionNames.CHECKED_SUBTRACT + ) + case a: Multiply => + BackendsApiManager.getSparkPlanExecApiInstance.genArithmeticTransformer( + substraitExprName, + replaceWithExpressionTransformerInternal(a.left, attributeSeq, expressionsMap), + replaceWithExpressionTransformerInternal(a.right, attributeSeq, expressionsMap), + a, + ExpressionNames.CHECKED_MULTIPLY + ) + case a: Divide => + BackendsApiManager.getSparkPlanExecApiInstance.genArithmeticTransformer( + substraitExprName, + replaceWithExpressionTransformerInternal(a.left, attributeSeq, expressionsMap), + replaceWithExpressionTransformerInternal(a.right, attributeSeq, expressionsMap), + a, + ExpressionNames.CHECKED_DIVIDE ) case tryEval: TryEval => // This is a placeholder to handle try_eval(other expressions). 
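The run of cases above maps both the TryEval-wrapped and the plain Add, Subtract, Multiply and Divide expressions onto the backend's checked arithmetic functions. The sketch below is illustrative only and is not part of the patch: it compresses that dispatch into a single pattern match to make the intent easier to follow. The string names stand in for the ExpressionNames.CHECKED_* constants, and the Option-based helper signature is an assumption of this example, not the converter's real interface.

import org.apache.spark.sql.catalyst.expressions.{Add, BinaryArithmetic, Divide, Expression, Multiply, Subtract, TryEval}

object CheckedArithmeticSketch {
  // Stand-in strings for the ExpressionNames.CHECKED_* constants referenced in the patch.
  private def checkedName(b: BinaryArithmetic): Option[String] = b match {
    case _: Add      => Some("checked_add")
    case _: Subtract => Some("checked_subtract")
    case _: Multiply => Some("checked_multiply")
    case _: Divide   => Some("checked_divide")
    case _           => None // Remainder, Pmod and friends are left to the generic path.
  }

  // Resolves an arithmetic expression to (checked function name, nullOnOverflow).
  // TryEval-wrapped arithmetic yields null on overflow; the plain form keeps the failure behavior.
  def resolve(expr: Expression): Option[(String, Boolean)] = expr match {
    case TryEval(b: BinaryArithmetic) => checkedName(b).map(name => (name, true))
    case b: BinaryArithmetic          => checkedName(b).map(name => (name, false))
    case _                            => None
  }
}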
@@ -604,6 +648,27 @@ object ExpressionConverter extends SQLConfHelper with Logging { replaceWithExpressionTransformerInternal(s.child, attributeSeq, expressionsMap), LiteralTransformer(Literal(s.randomSeed.get))), s) + case c: PreciseTimestampConversion => + BackendsApiManager.getSparkPlanExecApiInstance.genPreciseTimestampConversionTransformer( + substraitExprName, + Seq(replaceWithExpressionTransformerInternal(c.child, attributeSeq, expressionsMap)), + c + ) + case t: TransformKeys => + // default is `EXCEPTION` + val mapKeyDedupPolicy = SQLConf.get.getConf(SQLConf.MAP_KEY_DEDUP_POLICY) + if (mapKeyDedupPolicy == SQLConf.MapKeyDedupPolicy.LAST_WIN.toString) { + // TODO: Remove after fix ready for + // https://github.com/facebookincubator/velox/issues/10219 + throw new GlutenNotSupportException( + "LAST_WIN policy is not supported yet in native to deduplicate map keys" + ) + } + GenericExpressionTransformer( + substraitExprName, + t.children.map(replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap)), + t + ) case expr => GenericExpressionTransformer( substraitExprName, @@ -630,118 +695,4 @@ object ExpressionConverter extends SQLConfHelper with Logging { } substraitExprName } - - /** - * Transform BroadcastExchangeExec to ColumnarBroadcastExchangeExec in DynamicPruningExpression. - * - * @param partitionFilters - * The partition filter of Scan - * @return - * Transformed partition filter - */ - def transformDynamicPruningExpr(partitionFilters: Seq[Expression]): Seq[Expression] = { - - def convertBroadcastExchangeToColumnar( - exchange: BroadcastExchangeExec): ColumnarBroadcastExchangeExec = { - val newChild = Transitions.toBackendBatchPlan(exchange.child) - ColumnarBroadcastExchangeExec(exchange.mode, newChild) - } - - if ( - GlutenConfig.getConf.enableScanOnly || !GlutenConfig.getConf.enableColumnarBroadcastExchange - ) { - // Disable ColumnarSubqueryBroadcast for scan-only execution - // or ColumnarBroadcastExchange was disabled. - partitionFilters - } else { - val newPartitionFilters = partitionFilters.map { - case dynamicPruning: DynamicPruningExpression => - dynamicPruning.transform { - // Lookup inside subqueries for duplicate exchanges. - case in: InSubqueryExec => - in.plan match { - case s: SubqueryBroadcastExec => - val newIn = s - .transform { - case exchange: BroadcastExchangeExec => - convertBroadcastExchangeToColumnar(exchange) - } - .asInstanceOf[SubqueryBroadcastExec] - val transformSubqueryBroadcast = ColumnarSubqueryBroadcastExec( - newIn.name, - newIn.index, - newIn.buildKeys, - newIn.child) - - // When AQE is on, spark will apply ReuseAdaptiveSubquery rule first, - // it will reuse vanilla SubqueryBroadcastExec, - // and then use gluten ColumnarOverrides rule to transform Subquery, - // so all the SubqueryBroadcastExec in the ReusedSubqueryExec will be transformed - // to a new ColumnarSubqueryBroadcastExec for each SubqueryBroadcastExec, - // which will lead to execute ColumnarSubqueryBroadcastExec.relationFuture - // repeatedly even in the ReusedSubqueryExec. - // - // On the other hand, it needs to use - // the AdaptiveSparkPlanExec.AdaptiveExecutionContext to hold the reused map - // for each query. - newIn.child match { - case a: AdaptiveSparkPlanExec if SQLConf.get.subqueryReuseEnabled => - // When AQE is on and reuseSubquery is on. 
- a.context.subqueryCache - .update(newIn.canonicalized, transformSubqueryBroadcast) - case _ => - } - in.copy(plan = transformSubqueryBroadcast.asInstanceOf[BaseSubqueryExec]) - case r: ReusedSubqueryExec if r.child.isInstanceOf[SubqueryBroadcastExec] => - val newIn = r.child - .transform { - case exchange: BroadcastExchangeExec => - convertBroadcastExchangeToColumnar(exchange) - } - .asInstanceOf[SubqueryBroadcastExec] - newIn.child match { - case a: AdaptiveSparkPlanExec => - // Only when AQE is on, it needs to replace SubqueryBroadcastExec - // with reused ColumnarSubqueryBroadcastExec - val cachedSubquery = a.context.subqueryCache.get(newIn.canonicalized) - if (cachedSubquery.isDefined) { - in.copy(plan = ReusedSubqueryExec(cachedSubquery.get)) - } else { - val errMsg = "Can not get the reused ColumnarSubqueryBroadcastExec" + - "by the ${newIn.canonicalized}" - logWarning(errMsg) - throw new UnsupportedOperationException(errMsg) - } - case _ => - val errMsg = "Can not get the reused ColumnarSubqueryBroadcastExec" + - "by the ${newIn.canonicalized}" - logWarning(errMsg) - throw new UnsupportedOperationException(errMsg) - } - case _ => in - } - } - case e: Expression => e - } - updateSubqueryResult(newPartitionFilters) - newPartitionFilters - } - } - - private def updateSubqueryResult(partitionFilters: Seq[Expression]): Unit = { - // When it includes some DynamicPruningExpression, - // it needs to execute InSubqueryExec first, - // because doTransform path can't execute 'doExecuteColumnar' which will - // execute prepare subquery first. - partitionFilters.foreach { - case DynamicPruningExpression(inSubquery: InSubqueryExec) => - if (inSubquery.values().isEmpty) inSubquery.updateResult() - case e: Expression => - e.foreach { - case s: ScalarSubquery => s.updateResult() - case _ => - } - case _ => - } - } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala index dadeb6fec7260..edccf3514710c 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala @@ -76,6 +76,7 @@ object ExpressionMappings { Sig[Length](LENGTH), Sig[Lower](LOWER), Sig[Upper](UPPER), + Sig[SoundEx](SOUNDEX), Sig[StringLocate](LOCATE), Sig[StringTrimLeft](LTRIM), Sig[StringTrimRight](RTRIM), @@ -100,6 +101,10 @@ object ExpressionMappings { Sig[Encode](ENCODE), Sig[Uuid](UUID), Sig[BitLength](BIT_LENGTH), + Sig[OctetLength](OCTET_LENGTH), + Sig[Levenshtein](LEVENSHTEIN), + Sig[UnBase64](UNBASE64), + Sig[Base64](BASE64), // URL functions Sig[ParseUrl](PARSE_URL), @@ -195,6 +200,7 @@ object ExpressionMappings { Sig[UnixMicros](UNIX_MICROS), Sig[MillisToTimestamp](TIMESTAMP_MILLIS), Sig[MicrosToTimestamp](TIMESTAMP_MICROS), + Sig[PreciseTimestampConversion](PRECYSE_TIMESTAMP_CONVERSION), // JSON functions Sig[GetJsonObject](GET_JSON_OBJECT), Sig[LengthOfJsonArray](JSON_ARRAY_LENGTH), @@ -277,6 +283,7 @@ object ExpressionMappings { Sig[SparkPartitionID](SPARK_PARTITION_ID), Sig[WidthBucket](WIDTH_BUCKET), Sig[AssertTrue](ASSERT_TRUE), + Sig[ReplicateRows](REPLICATE_ROWS), // Decimal Sig[UnscaledValue](UNSCALED_VALUE), // Generator function diff --git a/gluten-core/src/main/scala/org/apache/gluten/expression/ScalarSubqueryTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/expression/ScalarSubqueryTransformer.scala index 0accf9ffd0f99..9508d27df73bd 100644 --- 
a/gluten-core/src/main/scala/org/apache/gluten/expression/ScalarSubqueryTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/expression/ScalarSubqueryTransformer.scala @@ -18,6 +18,7 @@ package org.apache.gluten.expression import org.apache.gluten.substrait.expression.{ExpressionBuilder, ExpressionNode} +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.ScalarSubquery @@ -30,21 +31,11 @@ case class ScalarSubqueryTransformer(substraitExprName: String, query: ScalarSub if (TransformerState.underValidationState) { return ExpressionBuilder.makeLiteral(null, query.dataType, true) } - // the first column in first row from `query`. - val rows = query.plan.executeCollect() - if (rows.length > 1) { - throw new IllegalStateException( - s"more than one row returned by a subquery used as an expression:\n${query.plan}") - } - val result: AnyRef = if (rows.length == 1) { - assert( - rows(0).numFields == 1, - s"Expects 1 field, but got ${rows(0).numFields}; something went wrong in analysis") - rows(0).get(0, query.dataType) - } else { - // If there is no rows returned, the result should be null. - null - } + // After https://github.com/apache/incubator-gluten/pull/5862, we do not need to execute the + // subquery manually, so the exception behavior is the same as in vanilla Spark. + // Note that this change is only a simplification: the subquery has already been materialized + // before doTransform is called. + val result = query.eval(InternalRow.empty) ExpressionBuilder.makeLiteral(result, query.dataType, result == null) } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/expression/UDFMappings.scala b/gluten-core/src/main/scala/org/apache/gluten/expression/UDFMappings.scala index 7c836252b92d1..3b64c5117fbb5 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/expression/UDFMappings.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/expression/UDFMappings.scala @@ -32,7 +32,7 @@ object UDFMappings extends Logging { val pythonUDFMap: Map[String, String] = Map() val scalaUDFMap: Map[String, String] = Map() - private def appendKVToMap(key: String, value: String, res: Map[String, String]) { + private def appendKVToMap(key: String, value: String, res: Map[String, String]): Unit = { if (key.isEmpty || value.isEmpty()) { throw new IllegalArgumentException(s"key:$key or value:$value is empty") } @@ -46,7 +46,7 @@ object UDFMappings extends Logging { res.put(key.toLowerCase(Locale.ROOT), value) } - private def parseStringToMap(input: String, res: Map[String, String]) { + private def parseStringToMap(input: String, res: Map[String, String]): Unit = { input.split(",").map { item => val keyValue = item.split(":") @@ -57,7 +57,7 @@ object UDFMappings extends Logging { } } - def loadFromSparkConf(conf: SparkConf) { + def loadFromSparkConf(conf: SparkConf): Unit = { val strHiveUDFs = conf.get(GlutenConfig.GLUTEN_SUPPORTED_HIVE_UDFS, "") if (!StringUtils.isBlank(strHiveUDFs)) { parseStringToMap(strHiveUDFs, hiveUDFMap) diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/StrategyOverrides.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/StrategyOverrides.scala deleted file mode 100644 index d016eacccf118..0000000000000 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/StrategyOverrides.scala +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.gluten.extension - -import org.apache.gluten.{GlutenConfig, GlutenSparkExtensionsInjector} -import org.apache.gluten.backendsapi.BackendsApiManager -import org.apache.gluten.extension.columnar.TRANSFORM_UNSUPPORTED -import org.apache.gluten.extension.columnar.TransformHints.TAG -import org.apache.gluten.utils.LogicalPlanSelector - -import org.apache.spark.sql.{SparkSession, SparkSessionExtensions, Strategy} -import org.apache.spark.sql.catalyst.SQLConfHelper -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, JoinSelectionHelper} -import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.execution.{joins, JoinSelectionShim, SparkPlan} -import org.apache.spark.sql.execution.adaptive.{BroadcastQueryStageExec, LogicalQueryStage} -import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec - -object StrategyOverrides extends GlutenSparkExtensionsInjector { - override def inject(extensions: SparkSessionExtensions): Unit = { - extensions.injectPlannerStrategy(JoinSelectionOverrides) - } -} - -case class JoinSelectionOverrides(session: SparkSession) - extends Strategy - with JoinSelectionHelper - with SQLConfHelper { - - private def isBroadcastStage(plan: LogicalPlan): Boolean = plan match { - case LogicalQueryStage(_, _: BroadcastQueryStageExec) => true - case _ => false - } - - def extractEqualJoinKeyCondition( - joinType: JoinType, - leftKeys: Seq[Expression], - rightKeys: Seq[Expression], - condition: Option[Expression], - left: LogicalPlan, - right: LogicalPlan, - hint: JoinHint, - forceShuffledHashJoin: Boolean): Seq[SparkPlan] = { - if (isBroadcastStage(left) || isBroadcastStage(right)) { - val buildSide = if (isBroadcastStage(left)) BuildLeft else BuildRight - Seq( - BroadcastHashJoinExec( - leftKeys, - rightKeys, - joinType, - buildSide, - condition, - planLater(left), - planLater(right))) - } else { - // Generate BHJ here, avoid to do match in `JoinSelection` again. - val isHintEmpty = hint.leftHint.isEmpty && hint.rightHint.isEmpty - val buildSide = getBroadcastBuildSide(left, right, joinType, hint, !isHintEmpty, conf) - if (buildSide.isDefined) { - return Seq( - joins.BroadcastHashJoinExec( - leftKeys, - rightKeys, - joinType, - buildSide.get, - condition, - planLater(left), - planLater(right))) - } - - if ( - forceShuffledHashJoin && - !BackendsApiManager.getSparkPlanExecApiInstance.joinFallback( - joinType, - left.outputSet, - right.outputSet, - condition) && - !left.getTagValue(TAG).isDefined && - !right.getTagValue(TAG).isDefined - ) { - // Force use of ShuffledHashJoin in preference to SortMergeJoin. With no respect to - // conf setting "spark.sql.join.preferSortMergeJoin". 
- val (leftBuildable, rightBuildable) = - if (BackendsApiManager.getSettings.utilizeShuffledHashJoinHint()) { - // Currently, ClickHouse backend can not support AQE, so it needs to use join hint - // to decide the build side, after supporting AQE, will remove this. - val leftHintEnabled = hintToShuffleHashJoinLeft(hint) - val rightHintEnabled = hintToShuffleHashJoinRight(hint) - val leftHintMergeEnabled = hint.leftHint.exists(_.strategy.contains(SHUFFLE_MERGE)) - val rightHintMergeEnabled = hint.rightHint.exists(_.strategy.contains(SHUFFLE_MERGE)) - if (leftHintEnabled || rightHintEnabled) { - (leftHintEnabled, rightHintEnabled) - } else if (leftHintMergeEnabled || rightHintMergeEnabled) { - // hack: when set SHUFFLE_MERGE hint, it means that - // it don't use this side as the build side - (!leftHintMergeEnabled, !rightHintMergeEnabled) - } else { - (canBuildShuffledHashJoinLeft(joinType), canBuildShuffledHashJoinRight(joinType)) - } - } else { - (canBuildShuffledHashJoinLeft(joinType), canBuildShuffledHashJoinRight(joinType)) - } - - if (!leftBuildable && !rightBuildable) { - return Nil - } - val buildSide = if (!leftBuildable) { - BuildRight - } else if (!rightBuildable) { - BuildLeft - } else { - getSmallerSide(left, right) - } - - return Option(buildSide) - .map { - buildSide => - Seq( - joins.ShuffledHashJoinExec( - leftKeys, - rightKeys, - joinType, - buildSide, - condition, - planLater(left), - planLater(right))) - } - .getOrElse(Nil) - } - Nil - } - } - - override def canBuildShuffledHashJoinLeft(joinType: JoinType): Boolean = { - BackendsApiManager.getSettings.supportHashBuildJoinTypeOnLeft(joinType) - } - - override def canBuildShuffledHashJoinRight(joinType: JoinType): Boolean = { - BackendsApiManager.getSettings.supportHashBuildJoinTypeOnRight(joinType) - } - - def existsMultiJoins(plan: LogicalPlan, count: Int = 0): Boolean = { - plan match { - case plan: Join => - if ((count + 1) >= GlutenConfig.getConf.logicalJoinOptimizationThrottle) return true - plan.children.exists(existsMultiJoins(_, count + 1)) - case plan: Project => - if ((count + 1) >= GlutenConfig.getConf.logicalJoinOptimizationThrottle) return true - plan.children.exists(existsMultiJoins(_, count + 1)) - case other => false - } - } - - def tagNotTransformable(plan: LogicalPlan, reason: String): LogicalPlan = { - plan.setTagValue(TAG, TRANSFORM_UNSUPPORTED(Some(reason))) - plan - } - - def tagNotTransformableRecursive(plan: LogicalPlan, reason: String): LogicalPlan = { - tagNotTransformable( - plan.withNewChildren(plan.children.map(tagNotTransformableRecursive(_, reason))), - reason) - } - - def existLeftOuterJoin(plan: LogicalPlan): Boolean = { - plan.collect { - case join: Join if join.joinType.sql.equals("LEFT OUTER") => - return true - }.size > 0 - } - - override def apply(plan: LogicalPlan): Seq[SparkPlan] = - LogicalPlanSelector.maybeNil(session, plan) { - // Ignore forceShuffledHashJoin if exist multi continuous joins - if ( - GlutenConfig.getConf.enableLogicalJoinOptimize && - existsMultiJoins(plan) && existLeftOuterJoin(plan) - ) { - tagNotTransformableRecursive(plan, "exist multi continuous joins") - } - plan match { - // If the build side of BHJ is already decided by AQE, we need to keep the build side. 
- case JoinSelectionShim.ExtractEquiJoinKeysShim( - joinType, - leftKeys, - rightKeys, - condition, - left, - right, - hint) => - extractEqualJoinKeyCondition( - joinType, - leftKeys, - rightKeys, - condition, - left, - right, - hint, - GlutenConfig.getConf.forceShuffledHashJoin) - case _ => Nil - } - } -} diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/ColumnarRuleApplier.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/ColumnarRuleApplier.scala index 17bf017305f28..ee5bcd883e7e7 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/ColumnarRuleApplier.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/ColumnarRuleApplier.scala @@ -16,8 +16,44 @@ */ package org.apache.gluten.extension.columnar +import org.apache.gluten.GlutenConfig +import org.apache.gluten.metrics.GlutenTimeMetric +import org.apache.gluten.utils.LogLevelUtil + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} import org.apache.spark.sql.execution.SparkPlan trait ColumnarRuleApplier { def apply(plan: SparkPlan, outputsColumnar: Boolean): SparkPlan } + +object ColumnarRuleApplier { + class Executor(phase: String, rules: Seq[Rule[SparkPlan]]) extends RuleExecutor[SparkPlan] { + private val batch: Batch = + Batch(s"Columnar (Phase [$phase])", Once, rules.map(r => new LoggedRule(r)): _*) + + // TODO Remove this exclusion then pass Spark's idempotence check. + override protected val excludedOnceBatches: Set[String] = Set(batch.name) + + override protected def batches: Seq[Batch] = List(batch) + } + + private class LoggedRule(delegate: Rule[SparkPlan]) + extends Rule[SparkPlan] + with Logging + with LogLevelUtil { + // Columnar plan change logging added since https://github.com/apache/incubator-gluten/pull/456. 
+ private val transformPlanLogLevel = GlutenConfig.getConf.transformPlanLogLevel + override val ruleName: String = delegate.ruleName + + override def apply(plan: SparkPlan): SparkPlan = GlutenTimeMetric.withMillisTime { + logOnLevel( + transformPlanLogLevel, + s"Preparing to apply rule $ruleName on plan:\n${plan.toString}") + val out = delegate.apply(plan) + logOnLevel(transformPlanLogLevel, s"Plan after applied rule $ruleName:\n${plan.toString}") + out + }(t => logOnLevel(transformPlanLogLevel, s"Applying rule $ruleName took $t ms.")) + } +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/EnsureLocalSortRequirements.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/EnsureLocalSortRequirements.scala index 0f5fc21aff870..afc29a51e19a7 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/EnsureLocalSortRequirements.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/EnsureLocalSortRequirements.scala @@ -37,7 +37,7 @@ object EnsureLocalSortRequirements extends Rule[SparkPlan] { requiredOrdering: Seq[SortOrder]): SparkPlan = { val newChild = SortExec(requiredOrdering, global = false, child = originalChild) if (!GlutenConfig.getConf.enableColumnarSort) { - TransformHints.tagNotTransformable(newChild, "columnar Sort is not enabled in SortExec") + FallbackTags.add(newChild, "columnar Sort is not enabled in SortExec") newChild } else { val newChildWithTransformer = @@ -50,7 +50,7 @@ object EnsureLocalSortRequirements extends Rule[SparkPlan] { if (validationResult.isValid) { newChildWithTransformer } else { - TransformHints.tagNotTransformable(newChild, validationResult) + FallbackTags.add(newChild, validationResult) newChild } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/ExpandFallbackPolicy.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/ExpandFallbackPolicy.scala index 6f8d7cde703bf..e334fcfbce889 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/ExpandFallbackPolicy.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/ExpandFallbackPolicy.scala @@ -235,7 +235,18 @@ case class ExpandFallbackPolicy(isAdaptiveContext: Boolean, originalPlan: SparkP } } - private def fallbackToRowBasedPlan(outputsColumnar: Boolean): SparkPlan = { + private def fallbackToRowBasedPlan(glutenPlan: SparkPlan, outputsColumnar: Boolean): SparkPlan = { + // Propagate fallback reason to vanilla SparkPlan + glutenPlan.foreach { + case _: GlutenPlan => + case p: SparkPlan if FallbackTags.nonEmpty(p) && p.logicalLink.isDefined => + originalPlan + .find(_.logicalLink.exists(_.fastEquals(p.logicalLink.get))) + .filterNot(FallbackTags.nonEmpty) + .foreach(origin => FallbackTags.tag(origin, FallbackTags.getTag(p))) + case _ => + } + val planWithTransitions = Transitions.insertTransitions(originalPlan, outputsColumnar) planWithTransitions } @@ -259,7 +270,7 @@ case class ExpandFallbackPolicy(isAdaptiveContext: Boolean, originalPlan: SparkP // Scan Parquet // | // ColumnarToRow - val vanillaSparkPlan = fallbackToRowBasedPlan(outputsColumnar) + val vanillaSparkPlan = fallbackToRowBasedPlan(plan, outputsColumnar) val vanillaSparkTransitionCost = countTransitionCostForVanillaSparkPlan(vanillaSparkPlan) if ( GlutenConfig.getConf.fallbackPreferColumnar && @@ -267,7 +278,7 @@ case class ExpandFallbackPolicy(isAdaptiveContext: Boolean, originalPlan: SparkP ) { plan } else { - TransformHints.tagAllNotTransformable( + 
FallbackTags.addRecursively( vanillaSparkPlan, TRANSFORM_UNSUPPORTED(fallbackInfo.reason, appendReasonIfExists = false)) FallbackNode(vanillaSparkPlan) diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/TransformHintRule.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/FallbackTagRule.scala similarity index 83% rename from gluten-core/src/main/scala/org/apache/gluten/extension/columnar/TransformHintRule.scala rename to gluten-core/src/main/scala/org/apache/gluten/extension/columnar/FallbackTagRule.scala index c9fcc52aa0915..d34cb0df4e7e4 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/TransformHintRule.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/FallbackTagRule.scala @@ -21,7 +21,7 @@ import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.exception.GlutenNotSupportException import org.apache.gluten.execution._ import org.apache.gluten.extension.{GlutenPlan, ValidationResult} -import org.apache.gluten.extension.columnar.TransformHints.EncodeTransformableTagImplicits +import org.apache.gluten.extension.columnar.FallbackTags.EncodeFallbackTagImplicits import org.apache.gluten.extension.columnar.validator.{Validator, Validators} import org.apache.gluten.sql.shims.SparkShimLoader @@ -45,19 +45,19 @@ import org.apache.spark.sql.types.StringType import org.apache.commons.lang3.exception.ExceptionUtils -sealed trait TransformHint { +sealed trait FallbackTag { val stacktrace: Option[String] = - if (TransformHints.DEBUG) { + if (FallbackTags.DEBUG) { Some(ExceptionUtils.getStackTrace(new Throwable())) } else None } case class TRANSFORM_UNSUPPORTED(reason: Option[String], appendReasonIfExists: Boolean = true) - extends TransformHint + extends FallbackTag -object TransformHints { - val TAG: TreeNodeTag[TransformHint] = - TreeNodeTag[TransformHint]("org.apache.gluten.transformhint") +object FallbackTags { + val TAG: TreeNodeTag[FallbackTag] = + TreeNodeTag[FallbackTag]("org.apache.gluten.FallbackTag") val DEBUG = false @@ -69,29 +69,23 @@ object TransformHints { * validation rule. So user should not consider the plan "transformable" unless all validation * rules are passed. */ - def isNotTransformable(plan: SparkPlan): Boolean = { - getHintOption(plan) match { + def nonEmpty(plan: SparkPlan): Boolean = { + getTagOption(plan) match { case Some(TRANSFORM_UNSUPPORTED(_, _)) => true case _ => false } } /** - * NOTE: To be deprecated. Do not create new usages of this method. - * - * Since it's usually not safe to consider a plan "transformable" during validation phase. Another - * validation rule could turn "transformable" to "non-transformable" before implementing the plan - * within Gluten transformers. + * If true, it implies the plan maybe transformable during validation phase but not guaranteed, + * since another validation rule could turn it to "non-transformable" before implementing the plan + * within Gluten transformers. If false, the plan node will be guaranteed fallback to Vanilla plan + * node while being implemented. 
*/ - def isTransformable(plan: SparkPlan): Boolean = { - getHintOption(plan) match { - case None => true - case _ => false - } - } + def maybeOffloadable(plan: SparkPlan): Boolean = !nonEmpty(plan) - def tag(plan: SparkPlan, hint: TransformHint): Unit = { - val mergedHint = getHintOption(plan) + def tag(plan: SparkPlan, hint: FallbackTag): Unit = { + val mergedHint = getTagOption(plan) .map { case originalHint @ TRANSFORM_UNSUPPORTED(Some(originalReason), originAppend) => hint match { @@ -123,33 +117,33 @@ object TransformHints { plan.unsetTagValue(TAG) } - def tagNotTransformable(plan: SparkPlan, validationResult: ValidationResult): Unit = { + def add(plan: SparkPlan, validationResult: ValidationResult): Unit = { if (!validationResult.isValid) { tag(plan, TRANSFORM_UNSUPPORTED(validationResult.reason)) } } - def tagNotTransformable(plan: SparkPlan, reason: String): Unit = { + def add(plan: SparkPlan, reason: String): Unit = { tag(plan, TRANSFORM_UNSUPPORTED(Some(reason))) } - def tagAllNotTransformable(plan: SparkPlan, hint: TRANSFORM_UNSUPPORTED): Unit = { + def addRecursively(plan: SparkPlan, hint: TRANSFORM_UNSUPPORTED): Unit = { plan.foreach { case _: GlutenPlan => // ignore case other => tag(other, hint) } } - def getHint(plan: SparkPlan): TransformHint = { - getHintOption(plan).getOrElse( + def getTag(plan: SparkPlan): FallbackTag = { + getTagOption(plan).getOrElse( throw new IllegalStateException("Transform hint tag not set in plan: " + plan.toString())) } - def getHintOption(plan: SparkPlan): Option[TransformHint] = { + def getTagOption(plan: SparkPlan): Option[FallbackTag] = { plan.getTagValue(TAG) } - implicit class EncodeTransformableTagImplicits(validationResult: ValidationResult) { + implicit class EncodeFallbackTagImplicits(validationResult: ValidationResult) { def tagOnFallback(plan: SparkPlan): Unit = { if (validationResult.isValid) { return @@ -163,7 +157,7 @@ object TransformHints { case class FallbackOnANSIMode(session: SparkSession) extends Rule[SparkPlan] { override def apply(plan: SparkPlan): SparkPlan = { if (GlutenConfig.getConf.enableAnsiMode) { - plan.foreach(TransformHints.tagNotTransformable(_, "does not support ansi mode")) + plan.foreach(FallbackTags.add(_, "does not support ansi mode")) } plan } @@ -182,11 +176,14 @@ case class FallbackMultiCodegens(session: SparkSession) extends Rule[SparkPlan] case plan: ShuffledHashJoinExec => if ((count + 1) >= optimizeLevel) return true plan.children.exists(existsMultiCodegens(_, count + 1)) - case other => false + case plan: SortMergeJoinExec if GlutenConfig.getConf.forceShuffledHashJoin => + if ((count + 1) >= optimizeLevel) return true + plan.children.exists(existsMultiCodegens(_, count + 1)) + case _ => false } - def tagNotTransformable(plan: SparkPlan): SparkPlan = { - TransformHints.tagNotTransformable(plan, "fallback multi codegens") + def addFallbackTag(plan: SparkPlan): SparkPlan = { + FallbackTags.add(plan, "fallback multi codegens") plan } @@ -203,35 +200,35 @@ case class FallbackMultiCodegens(session: SparkSession) extends Rule[SparkPlan] } } - def tagNotTransformableRecursive(plan: SparkPlan): SparkPlan = { + def addFallbackTagRecursive(plan: SparkPlan): SparkPlan = { plan match { case p: ShuffleExchangeExec => - tagNotTransformable(p.withNewChildren(p.children.map(tagNotTransformableForMultiCodegens))) + addFallbackTag(p.withNewChildren(p.children.map(tagOnFallbackForMultiCodegens))) case p: BroadcastExchangeExec => - 
tagNotTransformable(p.withNewChildren(p.children.map(tagNotTransformableForMultiCodegens))) + addFallbackTag(p.withNewChildren(p.children.map(tagOnFallbackForMultiCodegens))) case p: ShuffledHashJoinExec => - tagNotTransformable(p.withNewChildren(p.children.map(tagNotTransformableRecursive))) + addFallbackTag(p.withNewChildren(p.children.map(addFallbackTagRecursive))) case p if !supportCodegen(p) => - p.withNewChildren(p.children.map(tagNotTransformableForMultiCodegens)) + p.withNewChildren(p.children.map(tagOnFallbackForMultiCodegens)) case p if isAQEShuffleReadExec(p) => - p.withNewChildren(p.children.map(tagNotTransformableForMultiCodegens)) + p.withNewChildren(p.children.map(tagOnFallbackForMultiCodegens)) case p: QueryStageExec => p - case p => tagNotTransformable(p.withNewChildren(p.children.map(tagNotTransformableRecursive))) + case p => addFallbackTag(p.withNewChildren(p.children.map(addFallbackTagRecursive))) } } - def tagNotTransformableForMultiCodegens(plan: SparkPlan): SparkPlan = { + def tagOnFallbackForMultiCodegens(plan: SparkPlan): SparkPlan = { plan match { case plan if existsMultiCodegens(plan) => - tagNotTransformableRecursive(plan) + addFallbackTagRecursive(plan) case other => - other.withNewChildren(other.children.map(tagNotTransformableForMultiCodegens)) + other.withNewChildren(other.children.map(tagOnFallbackForMultiCodegens)) } } override def apply(plan: SparkPlan): SparkPlan = { if (physicalJoinOptimize) { - tagNotTransformableForMultiCodegens(plan) + tagOnFallbackForMultiCodegens(plan) } else plan } } @@ -275,13 +272,11 @@ case class FallbackEmptySchemaRelation() extends Rule[SparkPlan] { if (p.children.exists(_.output.isEmpty)) { // Some backends are not eligible to offload plan with zero-column input. // If any child have empty output, mark the plan and that child as UNSUPPORTED. - TransformHints.tagNotTransformable(p, "at least one of its children has empty output") + FallbackTags.add(p, "at least one of its children has empty output") p.children.foreach { child => if (child.output.isEmpty && !child.isInstanceOf[WriteFilesExec]) { - TransformHints.tagNotTransformable( - child, - "at least one of its children has empty output") + FallbackTags.add(child, "at least one of its children has empty output") } } } @@ -294,8 +289,8 @@ case class FallbackEmptySchemaRelation() extends Rule[SparkPlan] { // The doValidate function will be called to check if the conversion is supported. // If false is returned or any unsupported exception is thrown, a row guard will // be added on the top of that plan to prevent actual conversion. -case class AddTransformHintRule() extends Rule[SparkPlan] { - import AddTransformHintRule._ +case class AddFallbackTagRule() extends Rule[SparkPlan] { + import AddFallbackTagRule._ private val glutenConf: GlutenConfig = GlutenConfig.getConf private val validator = Validators .builder() @@ -308,22 +303,15 @@ case class AddTransformHintRule() extends Rule[SparkPlan] { .build() def apply(plan: SparkPlan): SparkPlan = { - addTransformableTags(plan) - } - - /** Inserts a transformable tag on top of those that are not supported. 
*/ - private def addTransformableTags(plan: SparkPlan): SparkPlan = { - // Walk the tree with post-order - val out = plan.mapChildren(addTransformableTags) - addTransformableTag(out) - out + plan.foreachUp { case p => addFallbackTag(p) } + plan } - private def addTransformableTag(plan: SparkPlan): Unit = { + private def addFallbackTag(plan: SparkPlan): Unit = { val outcome = validator.validate(plan) outcome match { case Validator.Failed(reason) => - TransformHints.tagNotTransformable(plan, reason) + FallbackTags.add(plan, reason) return case Validator.Passed => } @@ -343,7 +331,7 @@ case class AddTransformHintRule() extends Rule[SparkPlan] { // If filter expressions aren't empty, we need to transform the inner operators. if (plan.partitionFilters.isEmpty) { val transformer = - ScanTransformerFactory.createFileSourceScanTransformer(plan, validation = true) + ScanTransformerFactory.createFileSourceScanTransformer(plan) transformer.doValidate().tagOnFallback(plan) } case plan if HiveTableScanExecTransformer.isHiveTableScan(plan) => @@ -392,7 +380,7 @@ case class AddTransformHintRule() extends Rule[SparkPlan] { plan.leftKeys, plan.rightKeys, plan.joinType, - plan.buildSide, + OffloadJoin.getBuildSide(plan), plan.condition, plan.left, plan.right, @@ -458,8 +446,9 @@ case class AddTransformHintRule() extends Rule[SparkPlan] { ) transformer.doValidate().tagOnFallback(plan) case plan: CoalesceExec => - val transformer = CoalesceExecTransformer(plan.numPartitions, plan.child) - transformer.doValidate().tagOnFallback(plan) + ColumnarCoalesceExec(plan.numPartitions, plan.child) + .doValidate() + .tagOnFallback(plan) case plan: GlobalLimitExec => val (limit, offset) = SparkShimLoader.getSparkShims.getLimitAndOffsetFromGlobalLimit(plan) @@ -500,12 +489,21 @@ case class AddTransformHintRule() extends Rule[SparkPlan] { plan.child, offset) transformer.doValidate().tagOnFallback(plan) + case plan: SampleExec => + val transformer = BackendsApiManager.getSparkPlanExecApiInstance.genSampleExecTransformer( + plan.lowerBound, + plan.upperBound, + plan.withReplacement, + plan.seed, + plan.child + ) + transformer.doValidate().tagOnFallback(plan) case _ => - // Currently we assume a plan to be transformable by default. + // Currently we assume a plan to be offload-able by default. 
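As a minimal sketch (not part of the patch) of how the renamed tagging API above is intended to be read, with the helper name markUnsupported and the reason string being made up for illustration:

import org.apache.gluten.extension.columnar.FallbackTags
import org.apache.spark.sql.execution.SparkPlan

// Hypothetical helper, not in the patch: tag a node, then read the tag back through
// the renamed accessors (add/nonEmpty/maybeOffloadable replace the old TransformHints names).
def markUnsupported(plan: SparkPlan, reason: String): Unit = {
  FallbackTags.add(plan, reason)               // was TransformHints.tagNotTransformable
  assert(FallbackTags.nonEmpty(plan))          // was TransformHints.isNotTransformable
  assert(!FallbackTags.maybeOffloadable(plan)) // replaces the deprecated isTransformable
}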
} } catch { case e @ (_: GlutenNotSupportException | _: UnsupportedOperationException) => - TransformHints.tagNotTransformable( + FallbackTags.add( plan, s"${e.getMessage}, original Spark plan is " + s"${plan.getClass}(${plan.children.toList.map(_.getClass)})") @@ -516,7 +514,7 @@ case class AddTransformHintRule() extends Rule[SparkPlan] { } } -object AddTransformHintRule { +object AddFallbackTagRule { implicit private class ValidatorBuilderImplicits(builder: Validators.Builder) { /** @@ -554,9 +552,9 @@ object AddTransformHintRule { } } -case class RemoveTransformHintRule() extends Rule[SparkPlan] { +case class RemoveFallbackTagRule() extends Rule[SparkPlan] { override def apply(plan: SparkPlan): SparkPlan = { - plan.foreach(TransformHints.untag) + plan.foreach(FallbackTags.untag) plan } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/MiscColumnarRules.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/MiscColumnarRules.scala index fab973ffb0ed1..15fc8bea7054f 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/MiscColumnarRules.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/MiscColumnarRules.scala @@ -16,20 +16,21 @@ */ package org.apache.gluten.extension.columnar -import org.apache.gluten.extension.columnar.transition.ColumnarToRowLike +import org.apache.gluten.extension.columnar.transition.{ColumnarToRowLike, Transitions} import org.apache.gluten.utils.{LogLevelUtil, PlanUtil} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.rules.{PlanChangeLogger, Rule} import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.adaptive.BroadcastQueryStageExec -import org.apache.spark.sql.execution.exchange.{BroadcastExchangeLike, ShuffleExchangeLike} +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, BroadcastQueryStageExec} +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, BroadcastExchangeLike, ShuffleExchangeLike} +import org.apache.spark.sql.internal.SQLConf object MiscColumnarRules { object TransformPreOverrides { def apply(): TransformPreOverrides = { TransformPreOverrides( - List(OffloadFilter()), + List(OffloadProject(), OffloadFilter()), List( OffloadOthers(), OffloadAggregate(), @@ -58,6 +59,101 @@ object MiscColumnarRules { } } + // Replaces all SubqueryBroadcastExec used by sub-queries with ColumnarSubqueryBroadcastExec. + // This prevents query execution from being failed by fallen-back SubqueryBroadcastExec with + // child plan with columnar output (e.g., an adaptive Spark plan that yields final plan that + // is full-offloaded). ColumnarSubqueryBroadcastExec is both compatible with row-based and + // columnar child plan so is always functional. + case class RewriteSubqueryBroadcast() extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = { + val out = plan.transformWithSubqueries { + case p => + // Since https://github.com/apache/incubator-gluten/pull/1851. + // + // When AQE is on, the AQE sub-query cache should already be filled with + // row-based SubqueryBroadcastExec for reusing. Thus we are doing the same + // memorize-and-reuse work here for the replaced columnar version. 
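For context, a hypothetical query, not taken from the patch (paths and session setup are placeholders), whose physical plan may contain the SubqueryBroadcastExec that RewriteSubqueryBroadcast replaces:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
spark.range(0, 1000)
  .selectExpr("id", "id % 10 AS part")
  .write.partitionBy("part").mode("overwrite").parquet("/tmp/fact")
spark.range(0, 10)
  .selectExpr("id AS part", "id AS v")
  .write.mode("overwrite").parquet("/tmp/dim")

val fact = spark.read.parquet("/tmp/fact")
val dim = spark.read.parquet("/tmp/dim").where("v < 3")
// With dynamic partition pruning enabled (the default), the partitioned scan side may carry
// a SubqueryBroadcastExec; after this rule it becomes a ColumnarSubqueryBroadcastExec.
fact.join(dim, "part").collect()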
+ val reuseRemoved = removeReuses(p) + val replaced = replace(reuseRemoved) + replaced + } + out + } + + private def removeReuses(p: SparkPlan): SparkPlan = { + val out = p.transformExpressions { + case pe: ExecSubqueryExpression => + val newPlan = pe.plan match { + case ReusedSubqueryExec(s: SubqueryBroadcastExec) => + // Remove ReusedSubqueryExec. We will re-create reuses in subsequent method + // #replace. + // + // We assume only meeting reused sub-queries in AQE execution. When AQE is off, + // Spark adds reuses only after applying columnar rules by preparation rule + // ReuseExchangeAndSubquery. + assert(s.child.isInstanceOf[AdaptiveSparkPlanExec]) + s + case other => + other + } + pe.withNewPlan(newPlan) + } + out + } + + private def replace(p: SparkPlan): SparkPlan = { + val out = p.transformExpressions { + case pe: ExecSubqueryExpression => + val newPlan = pe.plan match { + case s: SubqueryBroadcastExec => + val columnarSubqueryBroadcast = toColumnarSubqueryBroadcast(s) + val maybeReused = columnarSubqueryBroadcast.child match { + case a: AdaptiveSparkPlanExec if SQLConf.get.subqueryReuseEnabled => + val cached = a.context.subqueryCache.get(columnarSubqueryBroadcast.canonicalized) + if (cached.nonEmpty) { + // Reuse the one in cache. + ReusedSubqueryExec(cached.get) + } else { + // Place columnar sub-query broadcast into cache, then return it. + a.context.subqueryCache + .update(columnarSubqueryBroadcast.canonicalized, columnarSubqueryBroadcast) + columnarSubqueryBroadcast + } + case _ => + // We are not in AQE. + columnarSubqueryBroadcast + } + maybeReused + case other => other + } + pe.withNewPlan(newPlan) + } + out + } + + private def toColumnarBroadcastExchange( + exchange: BroadcastExchangeExec): ColumnarBroadcastExchangeExec = { + val newChild = Transitions.toBackendBatchPlan(exchange.child) + ColumnarBroadcastExchangeExec(exchange.mode, newChild) + } + + private def toColumnarSubqueryBroadcast( + from: SubqueryBroadcastExec): ColumnarSubqueryBroadcastExec = { + val newChild = from.child match { + case exchange: BroadcastExchangeExec => + toColumnarBroadcastExchange(exchange) + case aqe: AdaptiveSparkPlanExec => + // Keeps the child if its is AQE even if its supportsColumnar == false. + // ColumnarSubqueryBroadcastExec is compatible with both row-based + // and columnar inputs. + aqe + case other => other + } + val out = ColumnarSubqueryBroadcastExec(from.name, from.index, from.buildKeys, newChild) + out + } + } + // Remove topmost columnar-to-row otherwise AQE throws error. 
// See: org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec#newQueryStage // diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/OffloadSingleNode.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/OffloadSingleNode.scala index 84a2ec5c6ec80..7a4222b5cb382 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/OffloadSingleNode.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/OffloadSingleNode.scala @@ -20,25 +20,27 @@ import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.exception.GlutenNotSupportException import org.apache.gluten.execution._ -import org.apache.gluten.expression.ExpressionConverter import org.apache.gluten.extension.GlutenPlan import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.utils.{LogLevelUtil, PlanUtil} import org.apache.spark.api.python.EvalPythonExecTransformer import org.apache.spark.internal.Logging -import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, NamedExpression} import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide} -import org.apache.spark.sql.catalyst.plans.{LeftOuter, LeftSemi, RightOuter} +import org.apache.spark.sql.catalyst.plans.logical.Join import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.execution.datasources.WriteFilesExec -import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, FileScan} +import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, DataSourceV2ScanExecBase} import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ShuffleExchangeExec} import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.execution.python.{ArrowEvalPythonExec, BatchEvalPythonExec} import org.apache.spark.sql.execution.window.{WindowExec, WindowGroupLimitExecShim} import org.apache.spark.sql.hive.HiveTableScanExecTransformer +import org.apache.spark.sql.types.{LongType, StringType} + +import scala.collection.mutable.Map /** * Converts a vanilla Spark plan node into Gluten plan node. Gluten plan is supposed to be executed @@ -54,7 +56,7 @@ sealed trait OffloadSingleNode extends Logging { // Aggregation transformation. case class OffloadAggregate() extends OffloadSingleNode with LogLevelUtil { override def offload(plan: SparkPlan): SparkPlan = plan match { - case plan if TransformHints.isNotTransformable(plan) => + case plan if FallbackTags.nonEmpty(plan) => plan case agg: HashAggregateExec => genHashAggregateExec(agg) @@ -70,7 +72,7 @@ case class OffloadAggregate() extends OffloadSingleNode with LogLevelUtil { * the actually used plan for execution. */ private def genHashAggregateExec(plan: HashAggregateExec): SparkPlan = { - if (TransformHints.isNotTransformable(plan)) { + if (FallbackTags.nonEmpty(plan)) { return plan } @@ -90,7 +92,7 @@ case class OffloadAggregate() extends OffloadSingleNode with LogLevelUtil { HashAggregateExecBaseTransformer.from(plan)() case _ => // If the child is not transformable, do not transform the agg. 
- TransformHints.tagNotTransformable(plan, "child output schema is empty") + FallbackTags.add(plan, "child output schema is empty") plan } } else { @@ -103,57 +105,28 @@ case class OffloadAggregate() extends OffloadSingleNode with LogLevelUtil { // Exchange transformation. case class OffloadExchange() extends OffloadSingleNode with LogLevelUtil { override def offload(plan: SparkPlan): SparkPlan = plan match { - case plan if TransformHints.isNotTransformable(plan) => - plan - case plan: ShuffleExchangeExec => - logDebug(s"Columnar Processing for ${plan.getClass} is currently supported.") - val child = plan.child - if ( - (child.supportsColumnar || GlutenConfig.getConf.enablePreferColumnar) && - BackendsApiManager.getSettings.supportColumnarShuffleExec() - ) { - BackendsApiManager.getSparkPlanExecApiInstance.genColumnarShuffleExchange(plan, child) - } else { - plan.withNewChildren(Seq(child)) - } - case plan: BroadcastExchangeExec => - val child = plan.child - logDebug(s"Columnar Processing for ${plan.getClass} is currently supported.") - ColumnarBroadcastExchangeExec(plan.mode, child) + case p if FallbackTags.nonEmpty(p) => + p + case s: ShuffleExchangeExec + if (s.child.supportsColumnar || GlutenConfig.getConf.enablePreferColumnar) && + BackendsApiManager.getSettings.supportColumnarShuffleExec() => + logDebug(s"Columnar Processing for ${s.getClass} is currently supported.") + BackendsApiManager.getSparkPlanExecApiInstance.genColumnarShuffleExchange(s) + case b: BroadcastExchangeExec => + val child = b.child + logDebug(s"Columnar Processing for ${b.getClass} is currently supported.") + ColumnarBroadcastExchangeExec(b.mode, child) case other => other } } // Join transformation. case class OffloadJoin() extends OffloadSingleNode with LogLevelUtil { - import OffloadJoin._ override def offload(plan: SparkPlan): SparkPlan = { - if (TransformHints.isNotTransformable(plan)) { + if (FallbackTags.nonEmpty(plan)) { logDebug(s"Columnar Processing for ${plan.getClass} is under row guard.") - plan match { - case shj: ShuffledHashJoinExec => - if (BackendsApiManager.getSettings.recreateJoinExecOnFallback()) { - // Since https://github.com/apache/incubator-gluten/pull/408 - // Because we manually removed the build side limitation for LeftOuter, LeftSemi and - // RightOuter, need to change the build side back if this join fallback into vanilla - // Spark for execution. 
- return ShuffledHashJoinExec( - shj.leftKeys, - shj.rightKeys, - shj.joinType, - getSparkSupportedBuildSide(shj), - shj.condition, - shj.left, - shj.right, - shj.isSkewJoin - ) - } else { - return shj - } - case p => - return p - } + return plan } plan match { case plan: ShuffledHashJoinExec => @@ -165,7 +138,7 @@ case class OffloadJoin() extends OffloadSingleNode with LogLevelUtil { plan.leftKeys, plan.rightKeys, plan.joinType, - plan.buildSide, + OffloadJoin.getBuildSide(plan), plan.condition, left, right, @@ -214,20 +187,162 @@ case class OffloadJoin() extends OffloadSingleNode with LogLevelUtil { case other => other } } - } object OffloadJoin { - private def getSparkSupportedBuildSide(plan: ShuffledHashJoinExec): BuildSide = { - plan.joinType match { - case LeftOuter | LeftSemi => BuildRight - case RightOuter => BuildLeft - case _ => plan.buildSide + + def getBuildSide(shj: ShuffledHashJoinExec): BuildSide = { + val leftBuildable = + BackendsApiManager.getSettings.supportHashBuildJoinTypeOnLeft(shj.joinType) + val rightBuildable = + BackendsApiManager.getSettings.supportHashBuildJoinTypeOnRight(shj.joinType) + if (!leftBuildable) { + BuildRight + } else if (!rightBuildable) { + BuildLeft + } else { + shj.logicalLink match { + case Some(join: Join) => + val leftSize = join.left.stats.sizeInBytes + val rightSize = join.right.stats.sizeInBytes + if (rightSize <= leftSize) BuildRight else BuildLeft + // Only the ShuffledHashJoinExec generated directly in some spark tests is not link + // logical plan, such as OuterJoinSuite. + case _ => shj.buildSide + } + } + } +} + +case class OffloadProject() extends OffloadSingleNode with LogLevelUtil { + private def containsInputFileRelatedExpr(expr: Expression): Boolean = { + expr match { + case _: InputFileName | _: InputFileBlockStart | _: InputFileBlockLength => true + case _ => expr.children.exists(containsInputFileRelatedExpr) } } - def isLegal(plan: ShuffledHashJoinExec): Boolean = { - plan.buildSide == getSparkSupportedBuildSide(plan) + private def rewriteExpr( + expr: Expression, + replacedExprs: Map[String, AttributeReference]): Expression = { + expr match { + case _: InputFileName => + replacedExprs.getOrElseUpdate( + expr.prettyName, + AttributeReference(expr.prettyName, StringType, false)()) + case _: InputFileBlockStart => + replacedExprs.getOrElseUpdate( + expr.prettyName, + AttributeReference(expr.prettyName, LongType, false)()) + case _: InputFileBlockLength => + replacedExprs.getOrElseUpdate( + expr.prettyName, + AttributeReference(expr.prettyName, LongType, false)()) + case other => + other.withNewChildren(other.children.map(child => rewriteExpr(child, replacedExprs))) + } + } + + private def addMetadataCol( + plan: SparkPlan, + replacedExprs: Map[String, AttributeReference]): SparkPlan = { + def genNewOutput(output: Seq[Attribute]): Seq[Attribute] = { + var newOutput = output + for ((_, newAttr) <- replacedExprs) { + if (!newOutput.exists(attr => attr.exprId == newAttr.exprId)) { + newOutput = newOutput :+ newAttr + } + } + newOutput + } + def genNewProjectList(projectList: Seq[NamedExpression]): Seq[NamedExpression] = { + var newProjectList = projectList + for ((_, newAttr) <- replacedExprs) { + if (!newProjectList.exists(attr => attr.exprId == newAttr.exprId)) { + newProjectList = newProjectList :+ newAttr.toAttribute + } + } + newProjectList + } + + plan match { + case f: FileSourceScanExec => + f.copy(output = genNewOutput(f.output)) + case f: FileSourceScanExecTransformer => + f.copy(output = genNewOutput(f.output)) + case b: 
BatchScanExec => + b.copy(output = genNewOutput(b.output).asInstanceOf[Seq[AttributeReference]]) + case b: BatchScanExecTransformer => + b.copy(output = genNewOutput(b.output).asInstanceOf[Seq[AttributeReference]]) + case p @ ProjectExec(projectList, child) => + p.copy(genNewProjectList(projectList), addMetadataCol(child, replacedExprs)) + case p @ ProjectExecTransformer(projectList, child) => + p.copy(genNewProjectList(projectList), addMetadataCol(child, replacedExprs)) + case _ => plan.withNewChildren(plan.children.map(addMetadataCol(_, replacedExprs))) + } + } + + private def tryOffloadProjectExecWithInputFileRelatedExprs( + projectExec: ProjectExec): SparkPlan = { + def findScanNodes(plan: SparkPlan): Seq[SparkPlan] = { + plan.collect { + case f @ (_: FileSourceScanExec | _: AbstractFileSourceScanExec | + _: DataSourceV2ScanExecBase) => + f + } + } + val addHint = AddFallbackTagRule() + val newProjectList = projectExec.projectList.filterNot(containsInputFileRelatedExpr) + val newProjectExec = ProjectExec(newProjectList, projectExec.child) + addHint.apply(newProjectExec) + if (FallbackTags.nonEmpty(newProjectExec)) { + // Project is still not transformable after remove `input_file_name` expressions. + projectExec + } else { + // the project with `input_file_name` expression should have at most + // one data source, reference: + // https://github.com/apache/spark/blob/e459674127e7b21e2767cc62d10ea6f1f941936c + // /sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala#L506 + val leafScans = findScanNodes(projectExec) + assert(leafScans.size <= 1) + if (leafScans.isEmpty || FallbackTags.nonEmpty(leafScans(0))) { + // It means + // 1. projectExec has `input_file_name` but no scan child. + // 2. It has scan child node but the scan node fallback. + projectExec + } else { + val replacedExprs = scala.collection.mutable.Map[String, AttributeReference]() + val newProjectList = projectExec.projectList.map { + expr => rewriteExpr(expr, replacedExprs).asInstanceOf[NamedExpression] + } + val newChild = addMetadataCol(projectExec.child, replacedExprs) + logDebug( + s"Columnar Processing for ${projectExec.getClass} with " + + s"ProjectList ${projectExec.projectList} is currently supported.") + ProjectExecTransformer(newProjectList, newChild) + } + } + } + + private def genProjectExec(projectExec: ProjectExec): SparkPlan = { + if ( + FallbackTags.nonEmpty(projectExec) && + BackendsApiManager.getSettings.supportNativeInputFileRelatedExpr() && + projectExec.projectList.exists(containsInputFileRelatedExpr) + ) { + tryOffloadProjectExecWithInputFileRelatedExprs(projectExec) + } else if (FallbackTags.nonEmpty(projectExec)) { + projectExec + } else { + logDebug(s"Columnar Processing for ${projectExec.getClass} is currently supported.") + ProjectExecTransformer(projectExec.projectList, projectExec.child) + } + } + + override def offload(plan: SparkPlan): SparkPlan = plan match { + case p: ProjectExec => + genProjectExec(p) + case other => other } } @@ -251,7 +366,7 @@ case class OffloadFilter() extends OffloadSingleNode with LogLevelUtil { * the actually used plan for execution. */ private def genFilterExec(filter: FilterExec): SparkPlan = { - if (TransformHints.isNotTransformable(filter)) { + if (FallbackTags.nonEmpty(filter)) { return filter } @@ -260,18 +375,15 @@ case class OffloadFilter() extends OffloadSingleNode with LogLevelUtil { // Push down the left conditions in Filter into FileSourceScan. 
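Relating to the OffloadProject rule above, a hypothetical query (not from the patch; the path is a placeholder) that exercises the input-file expression rewrite:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.input_file_name

val spark = SparkSession.builder().master("local[*]").getOrCreate()
spark.range(10).write.mode("overwrite").parquet("/tmp/t")
// When the backend reports supportNativeInputFileRelatedExpr(), this ProjectExec can still be
// offloaded as a ProjectExecTransformer instead of falling back on input_file_name().
spark.read.parquet("/tmp/t").select(input_file_name()).show()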
val newChild: SparkPlan = filter.child match { case scan @ (_: FileSourceScanExec | _: BatchScanExec) => - if (TransformHints.isTransformable(scan)) { + if (FallbackTags.maybeOffloadable(scan)) { val newScan = FilterHandler.pushFilterToScan(filter.condition, scan) newScan match { case ts: TransformSupport if ts.doValidate().isValid => ts - // TODO remove the call - case _ => replace.doReplace(scan) + case _ => scan } - } else { - replace.doReplace(scan) - } - case _ => replace.doReplace(filter.child) + } else scan + case _ => filter.child } logDebug(s"Columnar Processing for ${filter.getClass} is currently supported.") BackendsApiManager.getSparkPlanExecApiInstance @@ -298,18 +410,8 @@ object OffloadOthers { def doReplace(p: SparkPlan): SparkPlan = { val plan = p - if (TransformHints.isNotTransformable(plan)) { - logDebug(s"Columnar Processing for ${plan.getClass} is under row guard.") - plan match { - case plan: BatchScanExec => - return applyScanNotTransformable(plan) - case plan: FileSourceScanExec => - return applyScanNotTransformable(plan) - case plan if HiveTableScanExecTransformer.isHiveTableScan(plan) => - return applyScanNotTransformable(plan) - case p => - return p - } + if (FallbackTags.nonEmpty(plan)) { + return plan } plan match { case plan: BatchScanExec => @@ -320,11 +422,7 @@ object OffloadOthers { applyScanTransformer(plan) case plan: CoalesceExec => logDebug(s"Columnar Processing for ${plan.getClass} is currently supported.") - CoalesceExecTransformer(plan.numPartitions, plan.child) - case plan: ProjectExec => - val columnarChild = plan.child - logDebug(s"Columnar Processing for ${plan.getClass} is currently supported.") - ProjectExecTransformer(plan.projectList, columnarChild) + ColumnarCoalesceExec(plan.numPartitions, plan.child) case plan: SortAggregateExec => logDebug(s"Columnar Processing for ${plan.getClass} is currently supported.") HashAggregateExecBaseTransformer.from(plan) { @@ -432,6 +530,15 @@ object OffloadOthers { child, plan.evalType) } + case plan: SampleExec => + logDebug(s"Columnar Processing for ${plan.getClass} is currently supported.") + val child = plan.child + BackendsApiManager.getSparkPlanExecApiInstance.genSampleExecTransformer( + plan.lowerBound, + plan.upperBound, + plan.withReplacement, + plan.seed, + child) case p if !p.isInstanceOf[GlutenPlan] => logDebug(s"Transformation for ${p.getClass} is currently not supported.") val children = plan.children @@ -440,44 +547,6 @@ object OffloadOthers { } } - // Since https://github.com/apache/incubator-gluten/pull/2701 - private def applyScanNotTransformable(plan: SparkPlan): SparkPlan = plan match { - case plan: FileSourceScanExec => - val newPartitionFilters = - ExpressionConverter.transformDynamicPruningExpr(plan.partitionFilters) - val newSource = plan.copy(partitionFilters = newPartitionFilters) - if (plan.logicalLink.nonEmpty) { - newSource.setLogicalLink(plan.logicalLink.get) - } - TransformHints.tag(newSource, TransformHints.getHint(plan)) - newSource - case plan: BatchScanExec => - val newPartitionFilters: Seq[Expression] = plan.scan match { - case scan: FileScan => - ExpressionConverter.transformDynamicPruningExpr(scan.partitionFilters) - case _ => - ExpressionConverter.transformDynamicPruningExpr(plan.runtimeFilters) - } - val newSource = plan.copy(runtimeFilters = newPartitionFilters) - if (plan.logicalLink.nonEmpty) { - newSource.setLogicalLink(plan.logicalLink.get) - } - TransformHints.tag(newSource, TransformHints.getHint(plan)) - newSource - case plan if 
HiveTableScanExecTransformer.isHiveTableScan(plan) => - val newPartitionFilters: Seq[Expression] = - ExpressionConverter.transformDynamicPruningExpr( - HiveTableScanExecTransformer.getPartitionFilters(plan)) - val newSource = HiveTableScanExecTransformer.copyWith(plan, newPartitionFilters) - if (plan.logicalLink.nonEmpty) { - newSource.setLogicalLink(plan.logicalLink.get) - } - TransformHints.tag(newSource, TransformHints.getHint(plan)) - newSource - case other => - throw new UnsupportedOperationException(s"${other.getClass.toString} is not supported.") - } - /** * Apply scan transformer for file source and batch source, * 1. create new filter and scan transformer, 2. validate, tag new scan as unsupported if @@ -492,18 +561,13 @@ object OffloadOthers { transformer } else { logDebug(s"Columnar Processing for ${plan.getClass} is currently unsupported.") - val newSource = plan.copy(partitionFilters = transformer.getPartitionFilters()) - TransformHints.tagNotTransformable(newSource, validationResult.reason.get) - newSource + FallbackTags.add(plan, validationResult.reason.get) + plan } case plan: BatchScanExec => ScanTransformerFactory.createBatchScanTransformer(plan) - case plan if HiveTableScanExecTransformer.isHiveTableScan(plan) => // TODO: Add DynamicPartitionPruningHiveScanSuite.scala - val newPartitionFilters: Seq[Expression] = - ExpressionConverter.transformDynamicPruningExpr( - HiveTableScanExecTransformer.getPartitionFilters(plan)) val hiveTableScanExecTransformer = BackendsApiManager.getSparkPlanExecApiInstance.genHiveTableScanExecTransformer(plan) val validateResult = hiveTableScanExecTransformer.doValidate() @@ -512,9 +576,8 @@ object OffloadOthers { return hiveTableScanExecTransformer } logDebug(s"Columnar Processing for ${plan.getClass} is currently unsupported.") - val newSource = HiveTableScanExecTransformer.copyWith(plan, newPartitionFilters) - TransformHints.tagNotTransformable(newSource, validateResult.reason.get) - newSource + FallbackTags.add(plan, validateResult.reason.get) + plan case other => throw new GlutenNotSupportException(s"${other.getClass.toString} is not supported.") } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/RemoveNativeWriteFilesSortAndProject.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/RemoveNativeWriteFilesSortAndProject.scala index ce94626d999d5..d32de32ebb322 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/RemoveNativeWriteFilesSortAndProject.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/RemoveNativeWriteFilesSortAndProject.scala @@ -76,7 +76,7 @@ object NativeWriteFilesWithSkippingSortAndProject extends Logging { } else { // If we can not transform the project, then we fallback to origin plan which means // we also retain the sort operator. 
- TransformHints.tagNotTransformable(p, validationResult) + FallbackTags.add(p, validationResult) None } case _ => None diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedApplier.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedApplier.scala index a259641f50492..519db966c2252 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedApplier.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedApplier.scala @@ -19,16 +19,15 @@ package org.apache.gluten.extension.columnar.enumerated import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.extension.columnar._ -import org.apache.gluten.extension.columnar.MiscColumnarRules.{RemoveGlutenTableCacheColumnarToRow, RemoveTopmostColumnarToRow} +import org.apache.gluten.extension.columnar.MiscColumnarRules.{RemoveGlutenTableCacheColumnarToRow, RemoveTopmostColumnarToRow, RewriteSubqueryBroadcast} import org.apache.gluten.extension.columnar.transition.{InsertTransitions, RemoveTransitions} import org.apache.gluten.extension.columnar.util.AdaptiveContext -import org.apache.gluten.metrics.GlutenTimeMetric import org.apache.gluten.utils.{LogLevelUtil, PhysicalPlanSelector} import org.apache.spark.annotation.Experimental import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.rules.{PlanChangeLogger, Rule} +import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{ColumnarCollapseTransformStages, GlutenFallbackReporter, SparkPlan} import org.apache.spark.util.SparkRuleUtil @@ -46,42 +45,32 @@ class EnumeratedApplier(session: SparkSession) with Logging with LogLevelUtil { // An empirical value. 
- private val aqeStackTraceIndex = 16 - - private lazy val transformPlanLogLevel = GlutenConfig.getConf.transformPlanLogLevel - private lazy val planChangeLogger = new PlanChangeLogger[SparkPlan]() - + private val aqeStackTraceIndex = + if (scala.util.Properties.releaseVersion.exists(_.startsWith("2.12"))) { + 16 + } else { + 14 + } private val adaptiveContext = AdaptiveContext(session, aqeStackTraceIndex) override def apply(plan: SparkPlan, outputsColumnar: Boolean): SparkPlan = PhysicalPlanSelector.maybe(session, plan) { - val transformed = transformPlan(transformRules(outputsColumnar), plan, "transform") + val transformed = + transformPlan("transform", transformRules(outputsColumnar).map(_(session)), plan) val postPlan = maybeAqe { - transformPlan(postRules(), transformed, "post") + transformPlan("post", postRules().map(_(session)), transformed) } - val finalPlan = transformPlan(finalRules(), postPlan, "final") + val finalPlan = transformPlan("final", finalRules().map(_(session)), postPlan) finalPlan } private def transformPlan( - getRules: List[SparkSession => Rule[SparkPlan]], - plan: SparkPlan, - step: String) = GlutenTimeMetric.withMillisTime { - logOnLevel( - transformPlanLogLevel, - s"${step}ColumnarTransitions preOverriden plan:\n${plan.toString}") - val overridden = getRules.foldLeft(plan) { - (p, getRule) => - val rule = getRule(session) - val newPlan = rule(p) - planChangeLogger.logRule(rule.ruleName, p, newPlan) - newPlan - } - logOnLevel( - transformPlanLogLevel, - s"${step}ColumnarTransitions afterOverriden plan:\n${overridden.toString}") - overridden - }(t => logOnLevel(transformPlanLogLevel, s"${step}Transform SparkPlan took: $t ms.")) + phase: String, + rules: Seq[Rule[SparkPlan]], + plan: SparkPlan): SparkPlan = { + val executor = new ColumnarRuleApplier.Executor(phase, rules) + executor.execute(plan) + } private def maybeAqe[T](f: => T): T = { adaptiveContext.setAdaptiveContext() @@ -96,12 +85,13 @@ class EnumeratedApplier(session: SparkSession) * Rules to let planner create a suggested Gluten plan being sent to `fallbackPolicies` in which * the plan will be breakdown and decided to be fallen back or not. */ - private def transformRules(outputsColumnar: Boolean): List[SparkSession => Rule[SparkPlan]] = { + private def transformRules(outputsColumnar: Boolean): Seq[SparkSession => Rule[SparkPlan]] = { List( (_: SparkSession) => RemoveTransitions, (spark: SparkSession) => FallbackOnANSIMode(spark), (spark: SparkSession) => PlanOneRowRelation(spark), - (_: SparkSession) => FallbackEmptySchemaRelation() + (_: SparkSession) => FallbackEmptySchemaRelation(), + (_: SparkSession) => RewriteSubqueryBroadcast() ) ::: BackendsApiManager.getSparkPlanExecApiInstance.genExtendedColumnarValidationRules() ::: List((spark: SparkSession) => MergeTwoPhasesHashBaseAggregate(spark)) ::: @@ -125,7 +115,7 @@ class EnumeratedApplier(session: SparkSession) * Rules applying to non-fallen-back Gluten plans. To do some post cleanup works on the plan to * make sure it be able to run and be compatible with Spark's execution engine. 
*/ - private def postRules(): List[SparkSession => Rule[SparkPlan]] = + private def postRules(): Seq[SparkSession => Rule[SparkPlan]] = List( (s: SparkSession) => RemoveTopmostColumnarToRow(s, adaptiveContext.isAdaptiveContext())) ::: BackendsApiManager.getSparkPlanExecApiInstance.genExtendedColumnarPostRules() ::: @@ -136,14 +126,14 @@ class EnumeratedApplier(session: SparkSession) * Rules consistently applying to all input plans after all other rules have been applied, despite * whether the input plan is fallen back or not. */ - private def finalRules(): List[SparkSession => Rule[SparkPlan]] = { + private def finalRules(): Seq[SparkSession => Rule[SparkPlan]] = { List( // The rule is required despite whether the stage is fallen back or not. Since // ColumnarCachedBatchSerializer is statically registered to Spark without a columnar rule // when columnar table cache is enabled. (s: SparkSession) => RemoveGlutenTableCacheColumnarToRow(s), (s: SparkSession) => GlutenFallbackReporter(GlutenConfig.getConf, s), - (_: SparkSession) => RemoveTransformHintRule() + (_: SparkSession) => RemoveFallbackTagRule() ) } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedTransform.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedTransform.scala index 0b9dcc663246e..30e4c0a798230 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedTransform.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedTransform.scala @@ -43,6 +43,7 @@ case class EnumeratedTransform(session: SparkSession, outputsColumnar: Boolean) private val rules = List( new PushFilterToScan(RasOffload.validator), + RemoveSort, RemoveFilter ) @@ -53,6 +54,7 @@ case class EnumeratedTransform(session: SparkSession, outputsColumnar: Boolean) RasOffload.from[BaseJoinExec](OffloadJoin()).toRule, RasOffloadHashAggregate.toRule, RasOffloadFilter.toRule, + RasOffloadProject.toRule, RasOffload.from[DataSourceV2ScanExecBase](OffloadOthers()).toRule, RasOffload.from[DataSourceScanExec](OffloadOthers()).toRule, RasOffload diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RasOffload.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RasOffload.scala index 6af89dc057aa9..8091127da0bfa 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RasOffload.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RasOffload.scala @@ -105,13 +105,13 @@ object RasOffload { validator.validate(from) match { case Validator.Passed => val offloaded = base.offload(from) - offloaded match { - case t: GlutenPlan if !t.doValidate().isValid => - // 4. If native validation fails on the offloaded node, return the - // original one. - from - case other => - other + val offloadedNodes = offloaded.collect[GlutenPlan] { case t: GlutenPlan => t } + if (offloadedNodes.exists(!_.doValidate().isValid)) { + // 4. If native validation fails on the offloaded node, return the + // original one. 
+ from + } else { + offloaded } case Validator.Failed(reason) => from diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBroadcastApi.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RasOffloadProject.scala similarity index 60% rename from backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBroadcastApi.scala rename to gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RasOffloadProject.scala index bae3bb63599e5..0bbf57499b73a 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBroadcastApi.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RasOffloadProject.scala @@ -14,19 +14,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.backendsapi.velox +package org.apache.gluten.extension.columnar.enumerated -import org.apache.gluten.backendsapi.BroadcastApi +import org.apache.gluten.execution.ProjectExecTransformer -import java.util +import org.apache.spark.sql.execution.{ProjectExec, SparkPlan} -class VeloxBroadcastApi extends BroadcastApi { +object RasOffloadProject extends RasOffload { + override def offload(node: SparkPlan): SparkPlan = node match { + case ProjectExec(projectList, child) => + ProjectExecTransformer(projectList, child) + case other => + other + } - override def collectExecutionBroadcastTableId(executionId: String, buildTableId: String): Unit = - super.collectExecutionBroadcastTableId(executionId, buildTableId) - - override def cleanExecutionBroadcastTable( - executionId: String, - broadcastTableIds: util.Set[String]): Unit = - super.cleanExecutionBroadcastTable(executionId, broadcastTableIds) + override def typeIdentifier(): RasOffload.TypeIdentifier = + RasOffload.TypeIdentifier.of[ProjectExec] } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RemoveFilter.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RemoveFilter.scala index a3b4831a622d5..e2b8439fd218e 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RemoveFilter.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RemoveFilter.scala @@ -17,7 +17,7 @@ package org.apache.gluten.extension.columnar.enumerated import org.apache.gluten.execution._ -import org.apache.gluten.metrics.{MetricsUpdater, NoopMetricsUpdater} +import org.apache.gluten.metrics.MetricsUpdater import org.apache.gluten.ras.path.Pattern._ import org.apache.gluten.ras.path.Pattern.Matchers._ import org.apache.gluten.ras.rule.{RasRule, Shape} @@ -42,6 +42,7 @@ object RemoveFilter extends RasRule[SparkPlan] { val filter = node.asInstanceOf[FilterExecTransformerBase] if (filter.isNoop()) { val out = NoopFilter(filter.child, filter.output) + out.copyTagsFrom(filter) return List(out) } List.empty @@ -54,12 +55,12 @@ object RemoveFilter extends RasRule[SparkPlan] { leaf(clazz(classOf[BasicScanExecTransformer])) ).build()) - // A noop filter placeholder that indicates that all conditions are pushed into scan. + // A noop filter placeholder that indicates that all conditions were pushed down to scan. // // This operator has zero cost in cost model to avoid planner from choosing the // original filter-scan that doesn't have all conditions pushed down to scan. 
// - // We cannot simplify remove the filter to let planner choose the scan since by vanilla + // We cannot simply remove the filter to let planner choose the pushed scan since by vanilla // Spark's definition the filter may have different output nullability than scan. So // we have to keep this empty filter to let the optimized tree have the identical output schema // with the original tree. If we simply remove the filter, possible UBs might be caused. For @@ -71,12 +72,12 @@ // spark.sql.adaptive.logLevel=ERROR. case class NoopFilter(override val child: SparkPlan, override val output: Seq[Attribute]) extends UnaryTransformSupport { - override def metricsUpdater(): MetricsUpdater = NoopMetricsUpdater + override def metricsUpdater(): MetricsUpdater = MetricsUpdater.None override protected def withNewChildInternal(newChild: SparkPlan): SparkPlan = copy(newChild) override def outputPartitioning: Partitioning = child.outputPartitioning override def outputOrdering: Seq[SortOrder] = child.outputOrdering - override def doTransform(context: SubstraitContext): TransformContext = - child.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = + child.asInstanceOf[TransformSupport].transform(context) override protected def doExecuteColumnar(): RDD[ColumnarBatch] = child.executeColumnar() } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RemoveSort.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RemoveSort.scala new file mode 100644 index 0000000000000..5b5d5e541eb76 --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RemoveSort.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.extension.columnar.enumerated + +import org.apache.gluten.execution.{HashAggregateExecBaseTransformer, ShuffledHashJoinExecTransformerBase, SortExecTransformer} +import org.apache.gluten.extension.GlutenPlan +import org.apache.gluten.ras.path.Pattern._ +import org.apache.gluten.ras.path.Pattern.Matchers._ +import org.apache.gluten.ras.rule.{RasRule, Shape} +import org.apache.gluten.ras.rule.Shapes._ +import org.apache.spark.sql.catalyst.expressions.SortOrder +import org.apache.spark.sql.execution.SparkPlan + +/** + * Removes unnecessary sort if its parent doesn't require sorted input. + * + * TODO: Sort's removal could be made much simpler once output ordering is added as a physical + * property in RAS planner.
+ */ +object RemoveSort extends RasRule[SparkPlan] { + private val appliedTypes: Seq[Class[_ <: GlutenPlan]] = + List(classOf[HashAggregateExecBaseTransformer], classOf[ShuffledHashJoinExecTransformerBase]) + + override def shift(node: SparkPlan): Iterable[SparkPlan] = { + assert(node.isInstanceOf[GlutenPlan]) + val newChildren = node.requiredChildOrdering.zip(node.children).map { + case (Nil, sort: SortExecTransformer) => + // Parent doesn't ask for sorted input from this child but a sort op was somehow added. + // Remove it. + sort.child + case (req, child) => + // Parent asks for sorted input from this child. Do nothing but an assertion. + assert(SortOrder.orderingSatisfies(child.outputOrdering, req)) + child + } + val out = List(node.withNewChildren(newChildren)) + out + } + override def shape(): Shape[SparkPlan] = pattern( + branch2[SparkPlan]( + or(appliedTypes.map(clazz[SparkPlan](_)): _*), + _ >= 1, + _ => node(clazz(classOf[GlutenPlan])) + ).build() + ) +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/heuristic/HeuristicApplier.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/heuristic/HeuristicApplier.scala index 941677a6b933a..03b2b66b09b36 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/heuristic/HeuristicApplier.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/heuristic/HeuristicApplier.scala @@ -19,16 +19,15 @@ package org.apache.gluten.extension.columnar.heuristic import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.extension.columnar._ -import org.apache.gluten.extension.columnar.MiscColumnarRules.{RemoveGlutenTableCacheColumnarToRow, RemoveTopmostColumnarToRow, TransformPreOverrides} +import org.apache.gluten.extension.columnar.MiscColumnarRules.{RemoveGlutenTableCacheColumnarToRow, RemoveTopmostColumnarToRow, RewriteSubqueryBroadcast, TransformPreOverrides} import org.apache.gluten.extension.columnar.rewrite.RewriteSparkPlanRulesManager import org.apache.gluten.extension.columnar.transition.{InsertTransitions, RemoveTransitions} import org.apache.gluten.extension.columnar.util.AdaptiveContext -import org.apache.gluten.metrics.GlutenTimeMetric import org.apache.gluten.utils.{LogLevelUtil, PhysicalPlanSelector} import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.rules.{PlanChangeLogger, Rule} +import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{ColumnarCollapseTransformStages, GlutenFallbackReporter, SparkPlan} import org.apache.spark.util.SparkRuleUtil @@ -41,55 +40,45 @@ class HeuristicApplier(session: SparkSession) with Logging with LogLevelUtil { // This is an empirical value, may need to be changed for supporting other versions of spark. - private val aqeStackTraceIndex = 19 - - private lazy val transformPlanLogLevel = GlutenConfig.getConf.transformPlanLogLevel - private lazy val planChangeLogger = new PlanChangeLogger[SparkPlan]() - + private val aqeStackTraceIndex = + if (scala.util.Properties.releaseVersion.exists(_.startsWith("2.12"))) { + 19 + } else { + 17 + } private val adaptiveContext = AdaptiveContext(session, aqeStackTraceIndex) - override def apply(plan: SparkPlan, outputsColumnar: Boolean): SparkPlan = + override def apply(plan: SparkPlan, outputsColumnar: Boolean): SparkPlan = { withTransformRules(transformRules(outputsColumnar)).apply(plan) + } // Visible for testing. 
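// Editor's sketch (illustrative, not part of the patch): the AQE stack-trace index above is
// selected by probing the running Scala release; `scala.util.Properties.releaseVersion` yields
// e.g. Some("2.12.18") on a 2.12 build. The object name is hypothetical.
object AqeStackTraceIndexProbe {
  def index: Int =
    if (scala.util.Properties.releaseVersion.exists(_.startsWith("2.12"))) 19 else 17
}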
- def withTransformRules(transformRules: List[SparkSession => Rule[SparkPlan]]): Rule[SparkPlan] = + def withTransformRules(transformRules: Seq[SparkSession => Rule[SparkPlan]]): Rule[SparkPlan] = plan => PhysicalPlanSelector.maybe(session, plan) { val finalPlan = prepareFallback(plan) { p => - val suggestedPlan = transformPlan(transformRules, p, "transform") - transformPlan(fallbackPolicies(), suggestedPlan, "fallback") match { + val suggestedPlan = transformPlan("transform", transformRules.map(_(session)), p) + transformPlan("fallback", fallbackPolicies().map(_(session)), suggestedPlan) match { case FallbackNode(fallbackPlan) => // we should use vanilla c2r rather than native c2r, // and there should be no `GlutenPlan` any more, // so skip the `postRules()`. fallbackPlan case plan => - transformPlan(postRules(), plan, "post") + transformPlan("post", postRules().map(_(session)), plan) } } - transformPlan(finalRules(), finalPlan, "final") + transformPlan("final", finalRules().map(_(session)), finalPlan) } private def transformPlan( - getRules: List[SparkSession => Rule[SparkPlan]], - plan: SparkPlan, - step: String) = GlutenTimeMetric.withMillisTime { - logOnLevel( - transformPlanLogLevel, - s"${step}ColumnarTransitions preOverridden plan:\n${plan.toString}") - val overridden = getRules.foldLeft(plan) { - (p, getRule) => - val rule = getRule(session) - val newPlan = rule(p) - planChangeLogger.logRule(rule.ruleName, p, newPlan) - newPlan - } - logOnLevel( - transformPlanLogLevel, - s"${step}ColumnarTransitions afterOverridden plan:\n${overridden.toString}") - overridden - }(t => logOnLevel(transformPlanLogLevel, s"${step}Transform SparkPlan took: $t ms.")) + phase: String, + rules: Seq[Rule[SparkPlan]], + plan: SparkPlan): SparkPlan = { + val executor = new ColumnarRuleApplier.Executor(phase, rules) + executor.execute(plan) + } private def prepareFallback[T](plan: SparkPlan)(f: SparkPlan => T): T = { adaptiveContext.setAdaptiveContext() @@ -106,19 +95,20 @@ class HeuristicApplier(session: SparkSession) * Rules to let planner create a suggested Gluten plan being sent to `fallbackPolicies` in which * the plan will be breakdown and decided to be fallen back or not. */ - private def transformRules(outputsColumnar: Boolean): List[SparkSession => Rule[SparkPlan]] = { + private def transformRules(outputsColumnar: Boolean): Seq[SparkSession => Rule[SparkPlan]] = { List( (_: SparkSession) => RemoveTransitions, (spark: SparkSession) => FallbackOnANSIMode(spark), (spark: SparkSession) => FallbackMultiCodegens(spark), (spark: SparkSession) => PlanOneRowRelation(spark), - (_: SparkSession) => FallbackEmptySchemaRelation() + (_: SparkSession) => RewriteSubqueryBroadcast() ) ::: BackendsApiManager.getSparkPlanExecApiInstance.genExtendedColumnarValidationRules() ::: List( + (_: SparkSession) => FallbackEmptySchemaRelation(), (spark: SparkSession) => MergeTwoPhasesHashBaseAggregate(spark), (_: SparkSession) => RewriteSparkPlanRulesManager(), - (_: SparkSession) => AddTransformHintRule() + (_: SparkSession) => AddFallbackTagRule() ) ::: List((_: SparkSession) => TransformPreOverrides()) ::: List( @@ -137,7 +127,7 @@ class HeuristicApplier(session: SparkSession) * Rules to add wrapper `FallbackNode`s on top of the input plan, as hints to make planner fall * back the whole input plan to the original vanilla Spark plan. 
*/ - private def fallbackPolicies(): List[SparkSession => Rule[SparkPlan]] = { + private def fallbackPolicies(): Seq[SparkSession => Rule[SparkPlan]] = { List( (_: SparkSession) => ExpandFallbackPolicy(adaptiveContext.isAdaptiveContext(), adaptiveContext.originalPlan())) @@ -147,7 +137,7 @@ class HeuristicApplier(session: SparkSession) * Rules applying to non-fallen-back Gluten plans. To do some post cleanup works on the plan to * make sure it be able to run and be compatible with Spark's execution engine. */ - private def postRules(): List[SparkSession => Rule[SparkPlan]] = + private def postRules(): Seq[SparkSession => Rule[SparkPlan]] = List( (s: SparkSession) => RemoveTopmostColumnarToRow(s, adaptiveContext.isAdaptiveContext())) ::: BackendsApiManager.getSparkPlanExecApiInstance.genExtendedColumnarPostRules() ::: @@ -158,14 +148,14 @@ class HeuristicApplier(session: SparkSession) * Rules consistently applying to all input plans after all other rules have been applied, despite * whether the input plan is fallen back or not. */ - private def finalRules(): List[SparkSession => Rule[SparkPlan]] = { + private def finalRules(): Seq[SparkSession => Rule[SparkPlan]] = { List( // The rule is required despite whether the stage is fallen back or not. Since // ColumnarCachedBatchSerializer is statically registered to Spark without a columnar rule // when columnar table cache is enabled. (s: SparkSession) => RemoveGlutenTableCacheColumnarToRow(s), (s: SparkSession) => GlutenFallbackReporter(GlutenConfig.getConf, s), - (_: SparkSession) => RemoveTransformHintRule() + (_: SparkSession) => RemoveFallbackTagRule() ) } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/PullOutPreProject.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/PullOutPreProject.scala index 50dc55423605d..73b8ab2607eb4 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/PullOutPreProject.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/PullOutPreProject.scala @@ -75,6 +75,17 @@ object PullOutPreProject extends RewriteSingleNode with PullOutProjectHelper { case _ => false } case _ => false + }.isDefined) || + window.windowExpression.exists(_.find { + case we: WindowExpression => + we.windowSpec.frameSpecification match { + case swf: SpecifiedWindowFrame + if needPreComputeRangeFrame(swf) && supportPreComputeRangeFrame( + we.windowSpec.orderSpec) => + true + case _ => false + } + case _ => false }.isDefined) case plan if SparkShimLoader.getSparkShims.isWindowGroupLimitExec(plan) => val window = SparkShimLoader.getSparkShims @@ -174,7 +185,9 @@ object PullOutPreProject extends RewriteSingleNode with PullOutProjectHelper { // Handle windowExpressions. 
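        // Editor's illustration (hypothetical query, not from the patch): a window whose RANGE
        // frame uses constant offsets over an integral sort key is the case whose boundaries get
        // pre-computed into the pulled-out pre-project, e.g.
        //   SELECT id, sum(v) OVER (ORDER BY id RANGE BETWEEN 1 PRECEDING AND CURRENT ROW)
        //   FROM t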
val newWindowExpressions = window.windowExpression.toIndexedSeq.map { - _.transform { case we: WindowExpression => rewriteWindowExpression(we, expressionMap) } + _.transform { + case we: WindowExpression => rewriteWindowExpression(we, newOrderSpec, expressionMap) + } } val newWindow = window.copy( diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteJoin.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteJoin.scala new file mode 100644 index 0000000000000..e038f5af0a070 --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteJoin.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.extension.columnar.rewrite + +import org.apache.gluten.GlutenConfig +import org.apache.gluten.execution.SortUtils + +import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide, JoinSelectionHelper} +import org.apache.spark.sql.catalyst.plans.JoinType +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.joins.{ShuffledHashJoinExec, SortMergeJoinExec} + +/** + * If force ShuffledHashJoin, convert [[SortMergeJoinExec]] to [[ShuffledHashJoinExec]]. There is no + * need to select a smaller table as buildSide here, it will be reselected when offloading. 
+ */ +object RewriteJoin extends RewriteSingleNode with JoinSelectionHelper { + + private def getBuildSide(joinType: JoinType): Option[BuildSide] = { + val leftBuildable = canBuildShuffledHashJoinLeft(joinType) + val rightBuildable = canBuildShuffledHashJoinRight(joinType) + if (rightBuildable) { + Some(BuildRight) + } else if (leftBuildable) { + Some(BuildLeft) + } else { + None + } + } + + override def rewrite(plan: SparkPlan): SparkPlan = plan match { + case smj: SortMergeJoinExec if GlutenConfig.getConf.forceShuffledHashJoin => + getBuildSide(smj.joinType) match { + case Some(buildSide) => + ShuffledHashJoinExec( + smj.leftKeys, + smj.rightKeys, + smj.joinType, + buildSide, + smj.condition, + SortUtils.dropPartialSort(smj.left), + SortUtils.dropPartialSort(smj.right), + smj.isSkewJoin + ) + case _ => plan + } + case _ => plan + } +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteSingleNode.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteSingleNode.scala index 01f2e29fe62d4..551cfd599abd4 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteSingleNode.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteSingleNode.scala @@ -35,6 +35,6 @@ trait RewriteSingleNode { object RewriteSingleNode { def allRules(): Seq[RewriteSingleNode] = { - Seq(RewriteIn, RewriteMultiChildrenCount, PullOutPreProject, PullOutPostProject) + Seq(RewriteIn, RewriteMultiChildrenCount, RewriteJoin, PullOutPreProject, PullOutPostProject) } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteSparkPlanRulesManager.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteSparkPlanRulesManager.scala index ac663314bead5..2abd4d7d48074 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteSparkPlanRulesManager.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteSparkPlanRulesManager.scala @@ -16,7 +16,7 @@ */ package org.apache.gluten.extension.columnar.rewrite -import org.apache.gluten.extension.columnar.{AddTransformHintRule, TransformHint, TransformHints} +import org.apache.gluten.extension.columnar.{AddFallbackTagRule, FallbackTag, FallbackTags} import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.spark.rdd.RDD @@ -49,7 +49,7 @@ class RewriteSparkPlanRulesManager private (rewriteRules: Seq[RewriteSingleNode] extends Rule[SparkPlan] { private def mayNeedRewrite(plan: SparkPlan): Boolean = { - TransformHints.isTransformable(plan) && { + FallbackTags.maybeOffloadable(plan) && { plan match { case _: SortExec => true case _: TakeOrderedAndProjectExec => true @@ -67,15 +67,14 @@ class RewriteSparkPlanRulesManager private (rewriteRules: Seq[RewriteSingleNode] } } - private def getTransformHintBack( - origin: SparkPlan, - rewrittenPlan: SparkPlan): Option[TransformHint] = { - // The rewritten plan may contain more nodes than origin, here use the node name to get it back + private def getFallbackTagBack(rewrittenPlan: SparkPlan): Option[FallbackTag] = { + // The rewritten plan may contain more nodes than origin, for now it should only be + // `ProjectExec`. 
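    // Editor's note: the single node that is neither a pulled-out ProjectExec nor the
    // RewrittenNodeWall is taken as the rewritten counterpart of the original node, and its
    // fallback tag (if any) is handed back to the caller.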
val target = rewrittenPlan.collect { - case p if p.nodeName == origin.nodeName => p + case p if !p.isInstanceOf[ProjectExec] && !p.isInstanceOf[RewrittenNodeWall] => p } assert(target.size == 1) - TransformHints.getHintOption(target.head) + FallbackTags.getTagOption(target.head) } private def applyRewriteRules(origin: SparkPlan): (SparkPlan, Option[String]) = { @@ -94,7 +93,7 @@ class RewriteSparkPlanRulesManager private (rewriteRules: Seq[RewriteSingleNode] } override def apply(plan: SparkPlan): SparkPlan = { - val addHint = AddTransformHintRule() + val addHint = AddFallbackTagRule() plan.transformUp { case origin if mayNeedRewrite(origin) => // Add a wall to avoid transforming unnecessary nodes. @@ -105,18 +104,18 @@ class RewriteSparkPlanRulesManager private (rewriteRules: Seq[RewriteSingleNode] // Note, it is not expected, but it happens in CH backend when pulling out // aggregate. // TODO: Fix the exception and remove this branch - TransformHints.tagNotTransformable(origin, error.get) + FallbackTags.add(origin, error.get) origin } else if (withWall.fastEquals(rewrittenPlan)) { // Return origin if the rewrite rules do nothing. - // We do not add tag and leave it to the outside `AddTransformHintRule`. + // We do not add tag and leave it to the outside `AddFallbackTagRule`. origin } else { addHint.apply(rewrittenPlan) - val hint = getTransformHintBack(origin, rewrittenPlan) + val hint = getFallbackTagBack(rewrittenPlan) if (hint.isDefined) { // If the rewritten plan is still not transformable, return the original plan. - TransformHints.tag(origin, hint.get) + FallbackTags.tag(origin, hint.get) origin } else { rewrittenPlan.transformUp { diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/validator/Validators.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/validator/Validators.scala index d4bd9926a84cb..959bf808aba46 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/validator/Validators.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/validator/Validators.scala @@ -19,7 +19,7 @@ package org.apache.gluten.extension.columnar.validator import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.{BackendsApiManager, BackendSettingsApi} import org.apache.gluten.expression.ExpressionUtils -import org.apache.gluten.extension.columnar.{TRANSFORM_UNSUPPORTED, TransformHints} +import org.apache.gluten.extension.columnar.{FallbackTags, TRANSFORM_UNSUPPORTED} import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.spark.sql.execution._ @@ -97,7 +97,7 @@ object Validators { if (buffer.isEmpty) { NoopValidator } else { - new ValidatorPipeline(buffer) + new ValidatorPipeline(buffer.toSeq) } } } @@ -108,8 +108,8 @@ object Validators { private object FallbackByHint extends Validator { override def validate(plan: SparkPlan): Validator.OutCome = { - if (TransformHints.isNotTransformable(plan)) { - val hint = TransformHints.getHint(plan).asInstanceOf[TRANSFORM_UNSUPPORTED] + if (FallbackTags.nonEmpty(plan)) { + val hint = FallbackTags.getTag(plan).asInstanceOf[TRANSFORM_UNSUPPORTED] return fail(hint.reason.getOrElse("Reason not recorded")) } pass() @@ -192,6 +192,9 @@ object Validators { case p if HiveTableScanExecTransformer.isHiveTableScan(p) && !conf.enableColumnarHiveTableScan => fail(p) + case p: SampleExec + if !(conf.enableColumnarSample && BackendsApiManager.getSettings.supportSampleExec()) => + fail(p) case _ => pass() } } diff --git 
a/gluten-core/src/main/scala/org/apache/gluten/metrics/MetricsUpdater.scala b/gluten-core/src/main/scala/org/apache/gluten/metrics/MetricsUpdater.scala index 0a622ba0b37d4..5201df3b3472d 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/metrics/MetricsUpdater.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/metrics/MetricsUpdater.scala @@ -16,7 +16,6 @@ */ package org.apache.gluten.metrics -import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.utils.OASPackageBridge.InputMetricsWrapper /** @@ -26,16 +25,34 @@ import org.apache.spark.sql.utils.OASPackageBridge.InputMetricsWrapper * TODO: place it to some other where since it's used not only by whole stage facilities */ trait MetricsUpdater extends Serializable { + def updateInputMetrics(inputMetrics: InputMetricsWrapper): Unit = {} + def updateNativeMetrics(operatorMetrics: IOperatorMetrics): Unit = {} +} - def metrics: Map[String, SQLMetric] +object MetricsUpdater { + // An empty metrics updater. Used when the operator generates native metrics but + // it's yet unwanted to update the metrics in JVM side. + object Todo extends MetricsUpdater {} - def updateInputMetrics(inputMetrics: InputMetricsWrapper): Unit = {} + // Used when the operator doesn't generate native metrics. It could be because + // the operator doesn't generate any native query plan. + object None extends MetricsUpdater { + override def updateInputMetrics(inputMetrics: InputMetricsWrapper): Unit = + throw new UnsupportedOperationException() + override def updateNativeMetrics(operatorMetrics: IOperatorMetrics): Unit = + throw new UnsupportedOperationException() + } - def updateNativeMetrics(operatorMetrics: IOperatorMetrics): Unit = {} + // Indicates a branch of a MetricsUpdaterTree is terminated. It's not bound to + // any operators. 
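  // Editor's note (grounded in this patch): operators pick one of these updaters statically, e.g.
  //   override def metricsUpdater(): MetricsUpdater = MetricsUpdater.None   // no native metrics
  // while Terminate below is only a marker used when assembling a MetricsUpdaterTree and is
  // never meant to be invoked.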
+ object Terminate extends MetricsUpdater { + override def updateInputMetrics(inputMetrics: InputMetricsWrapper): Unit = + throw new UnsupportedOperationException() + override def updateNativeMetrics(operatorMetrics: IOperatorMetrics): Unit = + throw new UnsupportedOperationException() + } } final case class MetricsUpdaterTree(updater: MetricsUpdater, children: Seq[MetricsUpdaterTree]) -object NoopMetricsUpdater extends MetricsUpdater { - override def metrics: Map[String, SQLMetric] = Map.empty -} +object MetricsUpdaterTree {} diff --git a/gluten-core/src/main/scala/org/apache/gluten/planner/GlutenOptimization.scala b/gluten-core/src/main/scala/org/apache/gluten/planner/GlutenOptimization.scala index 555e7d6143bc1..5b24f596395d7 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/planner/GlutenOptimization.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/planner/GlutenOptimization.scala @@ -61,7 +61,7 @@ object GlutenOptimization { GlutenMetadataModel(), GlutenPropertyModel(), GlutenExplain, - RasRule.Factory.reuse(rules)) + RasRule.Factory.reuse(rules.toSeq)) } } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/planner/cost/GlutenCostModel.scala b/gluten-core/src/main/scala/org/apache/gluten/planner/cost/GlutenCostModel.scala index 4b5fa6803eeca..ab0deab1939d5 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/planner/cost/GlutenCostModel.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/planner/cost/GlutenCostModel.scala @@ -17,7 +17,6 @@ package org.apache.gluten.planner.cost import org.apache.gluten.GlutenConfig -import org.apache.gluten.extension.columnar.OffloadJoin import org.apache.gluten.extension.columnar.enumerated.RemoveFilter import org.apache.gluten.extension.columnar.transition.{ColumnarToRowLike, RowToColumnarLike} import org.apache.gluten.planner.plan.GlutenPlanModel.GroupLeafExec @@ -26,7 +25,6 @@ import org.apache.gluten.utils.PlanUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.{ColumnarToRowExec, RowToColumnarExec, SparkPlan} -import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec import org.apache.spark.sql.utils.ReflectionUtil object GlutenCostModel extends Logging { @@ -75,12 +73,8 @@ object GlutenCostModel extends Logging { // A very rough estimation as of now. private def selfLongCostOf(node: SparkPlan): Long = { node match { - case p: ShuffledHashJoinExec if !OffloadJoin.isLegal(p) => - // To exclude the rewritten intermediate plan that is not executable - // by vanilla Spark and was generated by strategy "JoinSelectionOverrides" - infLongCost case _: RemoveFilter.NoopFilter => - // To make planner choose the tree that has applied rule PushFilterToScan. + // To make planner choose the tree that has applied rule PushFilterToScan. 
0L case ColumnarToRowExec(child) => 3L case RowToColumnarExec(child) => 3L diff --git a/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/GlutenMetadata.scala b/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/GlutenMetadata.scala index e25f0a1f1c067..f66c5290e95fc 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/GlutenMetadata.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/GlutenMetadata.scala @@ -18,42 +18,18 @@ package org.apache.gluten.planner.metadata import org.apache.gluten.ras.Metadata -import org.apache.spark.sql.catalyst.expressions.Attribute - sealed trait GlutenMetadata extends Metadata { - import GlutenMetadata._ def schema(): Schema + def logicalLink(): LogicalLink } object GlutenMetadata { - def apply(schema: Schema): Metadata = { - Impl(schema) + def apply(schema: Schema, logicalLink: LogicalLink): Metadata = { + Impl(schema, logicalLink) } - private case class Impl(override val schema: Schema) extends GlutenMetadata - - case class Schema(output: Seq[Attribute]) { - private val hash = output.map(_.semanticHash()).hashCode() - - override def hashCode(): Int = { - hash - } - - override def equals(obj: Any): Boolean = obj match { - case other: Schema => - semanticEquals(other) - case _ => - false - } - - private def semanticEquals(other: Schema): Boolean = { - if (output.size != other.output.size) { - return false - } - output.zip(other.output).forall { - case (left, right) => - left.semanticEquals(right) - } - } + private case class Impl(override val schema: Schema, override val logicalLink: LogicalLink) + extends GlutenMetadata { + override def toString: String = s"$schema,$logicalLink" } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/GlutenMetadataModel.scala b/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/GlutenMetadataModel.scala index 6d1baa79db172..7b95f1383d04b 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/GlutenMetadataModel.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/GlutenMetadataModel.scala @@ -16,7 +16,6 @@ */ package org.apache.gluten.planner.metadata -import org.apache.gluten.planner.metadata.GlutenMetadata.Schema import org.apache.gluten.planner.plan.GlutenPlanModel.GroupLeafExec import org.apache.gluten.ras.{Metadata, MetadataModel} @@ -31,18 +30,22 @@ object GlutenMetadataModel extends Logging { private object MetadataModelImpl extends MetadataModel[SparkPlan] { override def metadataOf(node: SparkPlan): Metadata = node match { case g: GroupLeafExec => throw new UnsupportedOperationException() - case other => GlutenMetadata(Schema(other.output)) + case other => + GlutenMetadata( + Schema(other.output), + other.logicalLink.map(LogicalLink(_)).getOrElse(LogicalLink.notFound)) } - override def dummy(): Metadata = GlutenMetadata(Schema(List())) + override def dummy(): Metadata = GlutenMetadata(Schema(List()), LogicalLink.notFound) override def verify(one: Metadata, other: Metadata): Unit = (one, other) match { - case (left: GlutenMetadata, right: GlutenMetadata) if left.schema() != right.schema() => - // We apply loose restriction on schema. Since Gluten still have some customized - // logics causing schema of an operator to change after being transformed. 
- // For example: https://github.com/apache/incubator-gluten/pull/5171 - logWarning(s"Warning: Schema mismatch: one: ${left.schema()}, other: ${right.schema()}") - case (left: GlutenMetadata, right: GlutenMetadata) if left == right => + case (left: GlutenMetadata, right: GlutenMetadata) => + implicitly[Verifier[Schema]].verify(left.schema(), right.schema()) + implicitly[Verifier[LogicalLink]].verify(left.logicalLink(), right.logicalLink()) case _ => throw new IllegalStateException(s"Metadata mismatch: one: $one, other $other") } } + + trait Verifier[T <: Any] { + def verify(one: T, other: T): Unit + } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/LogicalLink.scala b/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/LogicalLink.scala new file mode 100644 index 0000000000000..4c3bffd471ad7 --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/LogicalLink.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.planner.metadata + +import org.apache.gluten.planner.metadata.GlutenMetadataModel.Verifier + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.logical +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} + +case class LogicalLink(plan: LogicalPlan) { + override def hashCode(): Int = System.identityHashCode(plan) + override def equals(obj: Any): Boolean = obj match { + // LogicalLink's comparison is based on ref equality of the logical plans being compared. + case LogicalLink(otherPlan) => plan eq otherPlan + case _ => false + } + + override def toString: String = s"${plan.nodeName}[${plan.stats.simpleString}]" +} + +object LogicalLink { + private case class LogicalLinkNotFound() extends logical.LeafNode { + override def output: Seq[Attribute] = List.empty + override def canEqual(that: Any): Boolean = throw new UnsupportedOperationException() + override def computeStats(): Statistics = Statistics(sizeInBytes = 0) + } + + val notFound = new LogicalLink(LogicalLinkNotFound()) + implicit val verifier: Verifier[LogicalLink] = new Verifier[LogicalLink] with Logging { + override def verify(one: LogicalLink, other: LogicalLink): Unit = { + // LogicalLink's comparison is based on ref equality of the logical plans being compared. 
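      // Editor's note: equality here is reference equality of the wrapped plans, so e.g.
      //   LogicalLink(p) == LogicalLink(p)          // true, same LogicalPlan instance
      //   LogicalLink(p) == LogicalLink(copyOf(p))  // false -> logs the mismatch warning below
      // where `copyOf` stands for any operation producing a new plan instance (hypothetical).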
+ if (one != other) { + logWarning(s"Warning: Logical link mismatch: one: $one, other: $other") + } + } + } +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/Schema.scala b/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/Schema.scala new file mode 100644 index 0000000000000..969d34d5cc828 --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/planner/metadata/Schema.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.planner.metadata + +import org.apache.gluten.planner.metadata.GlutenMetadataModel.Verifier + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.expressions.Attribute + +case class Schema(output: Seq[Attribute]) { + private val hash = output.map(_.semanticHash()).hashCode() + + override def hashCode(): Int = { + hash + } + + override def equals(obj: Any): Boolean = obj match { + case other: Schema => + semanticEquals(other) + case _ => + false + } + + private def semanticEquals(other: Schema): Boolean = { + if (output.size != other.output.size) { + return false + } + output.zip(other.output).forall { + case (left, right) => + left.semanticEquals(right) + } + } + + override def toString: String = { + output.toString() + } +} + +object Schema { + implicit val verifier: Verifier[Schema] = new Verifier[Schema] with Logging { + override def verify(one: Schema, other: Schema): Unit = { + if (one != other) { + // We apply loose restriction on schema. Since Gluten still have some customized + // logics causing schema of an operator to change after being transformed. 
+ // For example: https://github.com/apache/incubator-gluten/pull/5171 + logWarning(s"Warning: Schema mismatch: one: $one, other: $other") + } + } + } +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/planner/property/Conv.scala b/gluten-core/src/main/scala/org/apache/gluten/planner/property/Conv.scala index 475f6292094c9..18db0f959491c 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/planner/property/Conv.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/planner/property/Conv.scala @@ -99,6 +99,7 @@ case class ConvEnforcerRule(reqConv: Conv) extends RasRule[SparkPlan] { } val transition = Conv.findTransition(conv, reqConv) val after = transition.apply(node) + after.copyTagsFrom(node) List(after) } diff --git a/gluten-core/src/main/scala/org/apache/gluten/softaffinity/SoftAffinityManager.scala b/gluten-core/src/main/scala/org/apache/gluten/softaffinity/SoftAffinityManager.scala index 72d590d04c556..278e1b5500929 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/softaffinity/SoftAffinityManager.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/softaffinity/SoftAffinityManager.scala @@ -263,7 +263,7 @@ abstract class AffinityManager extends LogLevelUtil with Logging { rand.shuffle(hosts) logOnLevel(logLevel, s"get host for $f: ${hosts.distinct.mkString(",")}") } - hosts.distinct + hosts.distinct.toSeq } def updatePartitionMap(f: FilePartition, rddId: Int): Unit = { diff --git a/gluten-core/src/main/scala/org/apache/gluten/utils/InputPartitionsUtil.scala b/gluten-core/src/main/scala/org/apache/gluten/utils/InputPartitionsUtil.scala index 135721ff03a76..fa0823e1c6f15 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/utils/InputPartitionsUtil.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/utils/InputPartitionsUtil.scala @@ -22,10 +22,12 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.connector.read.InputPartition import org.apache.spark.sql.execution.datasources.{FilePartition, HadoopFsRelation, PartitionDirectory} +import org.apache.spark.sql.types.StructType import org.apache.spark.util.collection.BitSet case class InputPartitionsUtil( relation: HadoopFsRelation, + requiredSchema: StructType, selectedPartitions: Array[PartitionDirectory], output: Seq[Attribute], bucketedScan: Boolean, @@ -58,7 +60,7 @@ case class InputPartitionsUtil( // getPath() is very expensive so we only want to call it once in this block: val filePath = file.getPath val isSplitable = - relation.fileFormat.isSplitable(relation.sparkSession, relation.options, filePath) + SparkShimLoader.getSparkShims.isFileSplittable(relation, filePath, requiredSchema) SparkShimLoader.getSparkShims.splitFiles( sparkSession = relation.sparkSession, file = file, diff --git a/gluten-core/src/main/scala/org/apache/gluten/utils/Iterators.scala b/gluten-core/src/main/scala/org/apache/gluten/utils/Iterators.scala deleted file mode 100644 index 81ff2dc0b177d..0000000000000 --- a/gluten-core/src/main/scala/org/apache/gluten/utils/Iterators.scala +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.gluten.utils - -import org.apache.spark.{InterruptibleIterator, TaskContext} -import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.util.TaskResources - -import java.util.concurrent.TimeUnit -import java.util.concurrent.atomic.AtomicBoolean - -private class PayloadCloser[A](in: Iterator[A])(closeCallback: A => Unit) extends Iterator[A] { - private var closer: Option[() => Unit] = None - - TaskResources.addRecycler("Iterators#PayloadCloser", 100) { - tryClose() - } - - override def hasNext: Boolean = { - tryClose() - in.hasNext - } - - override def next(): A = { - val a: A = in.next() - closer.synchronized { - closer = Some( - () => { - closeCallback.apply(a) - }) - } - a - } - - private def tryClose(): Unit = { - closer.synchronized { - closer match { - case Some(c) => c.apply() - case None => - } - closer = None // make sure the payload is closed once - } - } -} - -private class IteratorCompleter[A](in: Iterator[A])(completionCallback: => Unit) - extends Iterator[A] { - private val completed = new AtomicBoolean(false) - - TaskResources.addRecycler("Iterators#IteratorRecycler", 100) { - tryComplete() - } - - override def hasNext: Boolean = { - val out = in.hasNext - if (!out) { - tryComplete() - } - out - } - - override def next(): A = { - in.next() - } - - private def tryComplete(): Unit = { - if (!completed.compareAndSet(false, true)) { - return // make sure the iterator is completed once - } - completionCallback - } -} - -private class PipelineTimeAccumulator[A](in: Iterator[A], pipelineTime: SQLMetric) - extends Iterator[A] { - private val closed = new AtomicBoolean(false) - private val startTime = System.nanoTime() - - TaskResources.addRecycler("Iterators#PipelineTimeAccumulator", 100) { - tryFinish() - } - - override def hasNext: Boolean = { - val out = in.hasNext - if (!out) { - tryFinish() - } - out - } - - override def next(): A = { - in.next() - } - - private def tryFinish(): Unit = { - // pipeline metric should only be calculate once. - if (!closed.compareAndSet(false, true)) { - return - } - pipelineTime += TimeUnit.NANOSECONDS.toMillis( - System.nanoTime() - startTime - ) - } -} - -/** - * To protect the wrapped iterator to avoid undesired order of calls to its `hasNext` and `next` - * methods. 
- */ -private class InvocationFlowProtection[A](in: Iterator[A]) extends Iterator[A] { - sealed private trait State - private case object Init extends State - private case class HasNextCalled(hasNext: Boolean) extends State - private case object NextCalled extends State - - private var state: State = Init - - override def hasNext: Boolean = { - val out = state match { - case Init | NextCalled => - in.hasNext - case HasNextCalled(lastHasNext) => - lastHasNext - } - state = HasNextCalled(out) - out - } - - override def next(): A = { - val out = state match { - case Init | NextCalled => - if (!in.hasNext) { - throw new IllegalStateException("End of stream") - } - in.next() - case HasNextCalled(lastHasNext) => - if (!lastHasNext) { - throw new IllegalStateException("End of stream") - } - in.next() - } - state = NextCalled - out - } -} - -class WrapperBuilder[A](in: Iterator[A]) { // FIXME how to make the ctor companion-private? - private var wrapped: Iterator[A] = in - - def recyclePayload(closeCallback: (A) => Unit): WrapperBuilder[A] = { - wrapped = new PayloadCloser(wrapped)(closeCallback) - this - } - - def recycleIterator(completionCallback: => Unit): WrapperBuilder[A] = { - wrapped = new IteratorCompleter(wrapped)(completionCallback) - this - } - - def addToPipelineTime(pipelineTime: SQLMetric): WrapperBuilder[A] = { - wrapped = new PipelineTimeAccumulator[A](wrapped, pipelineTime) - this - } - - def asInterruptible(context: TaskContext): WrapperBuilder[A] = { - wrapped = new InterruptibleIterator[A](context, wrapped) - this - } - - def protectInvocationFlow(): WrapperBuilder[A] = { - wrapped = new InvocationFlowProtection[A](wrapped) - this - } - - def create(): Iterator[A] = { - wrapped - } -} - -/** - * Utility class to provide iterator wrappers for non-trivial use cases. E.g. iterators that manage - * payload's lifecycle. 
- */ -object Iterators { - def wrap[A](in: Iterator[A]): WrapperBuilder[A] = { - new WrapperBuilder[A](in) - } -} diff --git a/gluten-core/src/main/scala/org/apache/gluten/utils/PullOutProjectHelper.scala b/gluten-core/src/main/scala/org/apache/gluten/utils/PullOutProjectHelper.scala index 824694837dcb8..12055f9e97210 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/utils/PullOutProjectHelper.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/utils/PullOutProjectHelper.scala @@ -16,11 +16,13 @@ */ package org.apache.gluten.utils -import org.apache.gluten.exception.GlutenNotSupportException +import org.apache.gluten.backendsapi.BackendsApiManager +import org.apache.gluten.exception.{GlutenException, GlutenNotSupportException} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction} import org.apache.spark.sql.execution.aggregate._ +import org.apache.spark.sql.types.{ByteType, DateType, IntegerType, LongType, ShortType} import java.util.concurrent.atomic.AtomicInteger @@ -61,7 +63,12 @@ trait PullOutProjectHelper { replaceBoundReference: Boolean = false): Expression = expr match { case alias: Alias => - projectExprsMap.getOrElseUpdate(alias.child.canonicalized, alias).toAttribute + alias.child match { + case _: Literal => + projectExprsMap.getOrElseUpdate(alias, alias).toAttribute + case _ => + projectExprsMap.getOrElseUpdate(alias.child.canonicalized, alias).toAttribute + } case attr: Attribute => attr case e: BoundReference if !replaceBoundReference => e case other => @@ -138,8 +145,49 @@ trait PullOutProjectHelper { ae.copy(aggregateFunction = newAggFunc, filter = newFilter) } + private def needPreComputeRangeFrameBoundary(bound: Expression): Boolean = { + bound match { + case _: PreComputeRangeFrameBound => false + case _ if !bound.foldable => false + case _ => true + } + } + + private def preComputeRangeFrameBoundary( + bound: Expression, + orderSpec: SortOrder, + expressionMap: mutable.HashMap[Expression, NamedExpression]): Expression = { + bound match { + case _: PreComputeRangeFrameBound => bound + case _ if !bound.foldable => bound + case _ if bound.foldable => + val a = expressionMap + .getOrElseUpdate( + bound.canonicalized, + Alias(Add(orderSpec.child, bound), generatePreAliasName)()) + PreComputeRangeFrameBound(a.asInstanceOf[Alias], bound) + } + } + + protected def needPreComputeRangeFrame(swf: SpecifiedWindowFrame): Boolean = { + BackendsApiManager.getSettings.needPreComputeRangeFrameBoundary && + swf.frameType == RangeFrame && + (needPreComputeRangeFrameBoundary(swf.lower) || needPreComputeRangeFrameBoundary(swf.upper)) + } + + protected def supportPreComputeRangeFrame(sortOrders: Seq[SortOrder]): Boolean = { + sortOrders.forall { + _.dataType match { + case ByteType | ShortType | IntegerType | LongType | DateType => true + // Only integral type & date type are supported for sort key with Range Frame + case _ => false + } + } + } + protected def rewriteWindowExpression( we: WindowExpression, + orderSpecs: Seq[SortOrder], expressionMap: mutable.HashMap[Expression, NamedExpression]): WindowExpression = { val newWindowFunc = we.windowFunction match { case windowFunc: WindowFunction => @@ -151,6 +199,22 @@ trait PullOutProjectHelper { case ae: AggregateExpression => rewriteAggregateExpression(ae, expressionMap) case other => other } - we.copy(windowFunction = newWindowFunc) + + val newWindowSpec = we.windowSpec.frameSpecification match { + case swf: SpecifiedWindowFrame if 
needPreComputeRangeFrame(swf) => + // This is guaranteed by Spark, but we still check it here + if (orderSpecs.size != 1) { + throw new GlutenException( + s"A range window frame with value boundaries expects one and only one " + + s"order by expression: ${orderSpecs.mkString(",")}") + } + val orderSpec = orderSpecs.head + val lowerFrameCol = preComputeRangeFrameBoundary(swf.lower, orderSpec, expressionMap) + val upperFrameCol = preComputeRangeFrameBoundary(swf.upper, orderSpec, expressionMap) + val newFrame = swf.copy(lower = lowerFrameCol, upper = upperFrameCol) + we.windowSpec.copy(frameSpecification = newFrame) + case _ => we.windowSpec + } + we.copy(windowFunction = newWindowFunc, windowSpec = newWindowSpec) } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/utils/SubstraitPlanPrinterUtil.scala b/gluten-core/src/main/scala/org/apache/gluten/utils/SubstraitPlanPrinterUtil.scala index 77d5d55f618d9..a6ec7cb21fbf9 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/utils/SubstraitPlanPrinterUtil.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/utils/SubstraitPlanPrinterUtil.scala @@ -24,37 +24,34 @@ import io.substrait.proto.{NamedStruct, Plan} object SubstraitPlanPrinterUtil extends Logging { - /** Transform Substrait Plan to json format. */ - def substraitPlanToJson(substraintPlan: Plan): String = { + private def typeRegistry( + d: com.google.protobuf.Descriptors.Descriptor): com.google.protobuf.TypeRegistry = { val defaultRegistry = WrappersProto.getDescriptor.getMessageTypes - val registry = com.google.protobuf.TypeRegistry + com.google.protobuf.TypeRegistry .newBuilder() - .add(substraintPlan.getDescriptorForType()) + .add(d) .add(defaultRegistry) .build() - JsonFormat.printer.usingTypeRegistry(registry).print(substraintPlan) + } + private def MessageToJson(message: com.google.protobuf.Message): String = { + val registry = typeRegistry(message.getDescriptorForType) + JsonFormat.printer.usingTypeRegistry(registry).print(message) } - def substraitNamedStructToJson(substraintPlan: NamedStruct): String = { - val defaultRegistry = WrappersProto.getDescriptor.getMessageTypes - val registry = com.google.protobuf.TypeRegistry - .newBuilder() - .add(substraintPlan.getDescriptorForType()) - .add(defaultRegistry) - .build() - JsonFormat.printer.usingTypeRegistry(registry).print(substraintPlan) + /** Transform Substrait Plan to json format. 
*/ + def substraitPlanToJson(substraitPlan: Plan): String = { + MessageToJson(substraitPlan) + } + + def substraitNamedStructToJson(namedStruct: NamedStruct): String = { + MessageToJson(namedStruct) } /** Transform substrait plan json string to PlanNode */ def jsonToSubstraitPlan(planJson: String): Plan = { try { val builder = Plan.newBuilder() - val defaultRegistry = WrappersProto.getDescriptor.getMessageTypes - val registry = com.google.protobuf.TypeRegistry - .newBuilder() - .add(builder.getDescriptorForType) - .add(defaultRegistry) - .build() + val registry = typeRegistry(builder.getDescriptorForType) JsonFormat.parser().usingTypeRegistry(registry).merge(planJson, builder) builder.build() } catch { diff --git a/gluten-core/src/main/scala/org/apache/gluten/utils/iterator/Iterators.scala b/gluten-core/src/main/scala/org/apache/gluten/utils/iterator/Iterators.scala new file mode 100644 index 0000000000000..eedfa66cfeaf1 --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/utils/iterator/Iterators.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.utils.iterator + +import org.apache.gluten.utils.iterator.IteratorsV1.WrapperBuilderV1 + +import org.apache.spark.TaskContext + +/** + * Utility class to provide iterator wrappers for non-trivial use cases. E.g. iterators that manage + * payload's lifecycle. + */ +object Iterators { + sealed trait Version + case object V1 extends Version + + private val DEFAULT_VERSION: Version = V1 + + trait WrapperBuilder[A] { + def recyclePayload(closeCallback: (A) => Unit): WrapperBuilder[A] + def recycleIterator(completionCallback: => Unit): WrapperBuilder[A] + def collectLifeMillis(onCollected: Long => Unit): WrapperBuilder[A] + def collectReadMillis(onAdded: Long => Unit): WrapperBuilder[A] + def asInterruptible(context: TaskContext): WrapperBuilder[A] + def protectInvocationFlow(): WrapperBuilder[A] + def create(): Iterator[A] + } + + def wrap[A](in: Iterator[A]): WrapperBuilder[A] = { + wrap(V1, in) + } + + def wrap[A](version: Version, in: Iterator[A]): WrapperBuilder[A] = { + version match { + case V1 => + new WrapperBuilderV1[A](in) + } + } +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/utils/iterator/IteratorsV1.scala b/gluten-core/src/main/scala/org/apache/gluten/utils/iterator/IteratorsV1.scala new file mode 100644 index 0000000000000..3e9248c44458f --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/utils/iterator/IteratorsV1.scala @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.utils.iterator + +import org.apache.gluten.utils.iterator.Iterators.WrapperBuilder + +import org.apache.spark.{InterruptibleIterator, TaskContext} +import org.apache.spark.util.TaskResources + +import java.util.concurrent.TimeUnit +import java.util.concurrent.atomic.AtomicBoolean + +object IteratorsV1 { + private class PayloadCloser[A](in: Iterator[A])(closeCallback: A => Unit) extends Iterator[A] { + private var closer: Option[() => Unit] = None + + TaskResources.addRecycler("Iterators#PayloadCloser", 100) { + tryClose() + } + + override def hasNext: Boolean = { + tryClose() + in.hasNext + } + + override def next(): A = { + val a: A = in.next() + closer.synchronized { + closer = Some( + () => { + closeCallback.apply(a) + }) + } + a + } + + private def tryClose(): Unit = { + closer.synchronized { + closer match { + case Some(c) => c.apply() + case None => + } + closer = None // make sure the payload is closed once + } + } + } + + private class IteratorCompleter[A](in: Iterator[A])(completionCallback: => Unit) + extends Iterator[A] { + private val completed = new AtomicBoolean(false) + + TaskResources.addRecycler("Iterators#IteratorRecycler", 100) { + tryComplete() + } + + override def hasNext: Boolean = { + val out = in.hasNext + if (!out) { + tryComplete() + } + out + } + + override def next(): A = { + in.next() + } + + private def tryComplete(): Unit = { + if (!completed.compareAndSet(false, true)) { + return // make sure the iterator is completed once + } + completionCallback + } + } + + private class LifeTimeAccumulator[A](in: Iterator[A], onCollected: Long => Unit) + extends Iterator[A] { + private val closed = new AtomicBoolean(false) + private val startTime = System.nanoTime() + + TaskResources.addRecycler("Iterators#LifeTimeAccumulator", 100) { + tryFinish() + } + + override def hasNext: Boolean = { + val out = in.hasNext + if (!out) { + tryFinish() + } + out + } + + override def next(): A = { + in.next() + } + + private def tryFinish(): Unit = { + // pipeline metric should only be calculate once. + if (!closed.compareAndSet(false, true)) { + return + } + val lifeTime = TimeUnit.NANOSECONDS.toMillis( + System.nanoTime() - startTime + ) + onCollected(lifeTime) + } + } + + private class ReadTimeAccumulator[A](in: Iterator[A], onAdded: Long => Unit) extends Iterator[A] { + + override def hasNext: Boolean = { + val prev = System.nanoTime() + val out = in.hasNext + val after = System.nanoTime() + val duration = TimeUnit.NANOSECONDS.toMillis(after - prev) + onAdded(duration) + out + } + + override def next(): A = { + val prev = System.nanoTime() + val out = in.next() + val after = System.nanoTime() + val duration = TimeUnit.NANOSECONDS.toMillis(after - prev) + onAdded(duration) + out + } + } + + /** + * To protect the wrapped iterator to avoid undesired order of calls to its `hasNext` and `next` + * methods. 
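   *
   * Editor's illustration (grounded in the state machine below): repeated `hasNext` calls consult
   * the wrapped iterator only once until `next` is called, and calling `next` without a prior
   * `hasNext` still performs its own end-of-stream check.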
+ */ + private class InvocationFlowProtection[A](in: Iterator[A]) extends Iterator[A] { + sealed private trait State + private case object Init extends State + private case class HasNextCalled(hasNext: Boolean) extends State + private case object NextCalled extends State + + private var state: State = Init + + override def hasNext: Boolean = { + val out = state match { + case Init | NextCalled => + in.hasNext + case HasNextCalled(lastHasNext) => + lastHasNext + } + state = HasNextCalled(out) + out + } + + override def next(): A = { + val out = state match { + case Init | NextCalled => + if (!in.hasNext) { + throw new IllegalStateException("End of stream") + } + in.next() + case HasNextCalled(lastHasNext) => + if (!lastHasNext) { + throw new IllegalStateException("End of stream") + } + in.next() + } + state = NextCalled + out + } + } + + class WrapperBuilderV1[A] private[iterator] (in: Iterator[A]) extends WrapperBuilder[A] { + private var wrapped: Iterator[A] = in + + override def recyclePayload(closeCallback: (A) => Unit): WrapperBuilder[A] = { + wrapped = new PayloadCloser(wrapped)(closeCallback) + this + } + + override def recycleIterator(completionCallback: => Unit): WrapperBuilder[A] = { + wrapped = new IteratorCompleter(wrapped)(completionCallback) + this + } + + override def collectLifeMillis(onCollected: Long => Unit): WrapperBuilder[A] = { + wrapped = new LifeTimeAccumulator[A](wrapped, onCollected) + this + } + + override def collectReadMillis(onAdded: Long => Unit): WrapperBuilder[A] = { + wrapped = new ReadTimeAccumulator[A](wrapped, onAdded) + this + } + + override def asInterruptible(context: TaskContext): WrapperBuilder[A] = { + wrapped = new InterruptibleIterator[A](context, wrapped) + this + } + + override def protectInvocationFlow(): WrapperBuilder[A] = { + wrapped = new InvocationFlowProtection[A](wrapped) + this + } + + override def create(): Iterator[A] = { + wrapped + } + } +} diff --git a/gluten-core/src/main/scala/org/apache/spark/HdfsConfGenerator.scala b/gluten-core/src/main/scala/org/apache/spark/HdfsConfGenerator.scala index 9756837d96e56..04272517e5bb2 100644 --- a/gluten-core/src/main/scala/org/apache/spark/HdfsConfGenerator.scala +++ b/gluten-core/src/main/scala/org/apache/spark/HdfsConfGenerator.scala @@ -41,8 +41,8 @@ object HdfsConfGenerator extends Logging { addFileMethod.invoke(sc, path, Boolean.box(false), Boolean.box(true), Boolean.box(false)) // Overwrite the spark internal config `spark.app.initial.file.urls`, // so that the file can be available before initializing executor plugin. 
- assert(sc.addedFiles.nonEmpty) - sc.conf.set("spark.app.initial.file.urls", sc.addedFiles.keys.toSeq.mkString(",")) + assert(sc.listFiles.nonEmpty) + sc.conf.set("spark.app.initial.file.urls", sc.listFiles().mkString(",")) } private def ignoreKey(key: String): Boolean = { diff --git a/gluten-core/src/main/scala/org/apache/spark/listener/GlutenListenerFactory.scala b/gluten-core/src/main/scala/org/apache/spark/listener/GlutenListenerFactory.scala index 9413941fe5982..721711af5fca3 100644 --- a/gluten-core/src/main/scala/org/apache/spark/listener/GlutenListenerFactory.scala +++ b/gluten-core/src/main/scala/org/apache/spark/listener/GlutenListenerFactory.scala @@ -20,12 +20,9 @@ import org.apache.gluten.GlutenConfig import org.apache.gluten.softaffinity.scheduler.SoftAffinityListener import org.apache.spark.SparkContext -import org.apache.spark.rpc.GlutenDriverEndpoint object GlutenListenerFactory { def addToSparkListenerBus(sc: SparkContext): Unit = { - sc.listenerBus.addToStatusQueue( - new GlutenSQLAppStatusListener(GlutenDriverEndpoint.glutenDriverEndpointRef)) if ( sc.getConf.getBoolean( GlutenConfig.GLUTEN_SOFT_AFFINITY_ENABLED, diff --git a/gluten-core/src/main/scala/org/apache/spark/memory/SparkMemoryUtil.scala b/gluten-core/src/main/scala/org/apache/spark/memory/SparkMemoryUtil.scala index 48ed08fb71cec..178310fd64976 100644 --- a/gluten-core/src/main/scala/org/apache/spark/memory/SparkMemoryUtil.scala +++ b/gluten-core/src/main/scala/org/apache/spark/memory/SparkMemoryUtil.scala @@ -44,6 +44,10 @@ object SparkMemoryUtil { consumersField.setAccessible(true) taskIdField.setAccessible(true) + def bytesToString(size: Long): String = { + Utils.bytesToString(size) + } + // We assume storage memory can be fully transferred to execution memory so far def getCurrentAvailableOffHeapMemory: Long = { val mm = SparkEnv.get.memoryManager diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/catalyst/expressions/PreComputeRangeFrameBound.scala b/gluten-core/src/main/scala/org/apache/spark/sql/catalyst/expressions/PreComputeRangeFrameBound.scala new file mode 100644 index 0000000000000..73c1cb3de609e --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/spark/sql/catalyst/expressions/PreComputeRangeFrameBound.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} +import org.apache.spark.sql.types.{DataType, Metadata} + +/** + * Represents a pre-compute boundary for range frame when boundary is non-SpecialFrameBoundary, + * since Velox doesn't support constant offset for range frame. 
It acts like the original boundary + * which is foldable and generate the same result when eval is invoked so that if the WindowExec + * fallback to Vanilla Spark it can still work correctly. + * @param child + * The alias to pre-compute projection column + * @param originalBound + * The original boundary which is a foldable expression + */ +case class PreComputeRangeFrameBound(child: Alias, originalBound: Expression) + extends UnaryExpression + with NamedExpression { + + override def foldable: Boolean = true + + override def eval(input: InternalRow): Any = originalBound.eval(input) + + override def genCode(ctx: CodegenContext): ExprCode = originalBound.genCode(ctx) + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = + originalBound.genCode(ctx) + + override def name: String = child.name + + override def exprId: ExprId = child.exprId + + override def qualifier: Seq[String] = child.qualifier + + override def newInstance(): NamedExpression = + PreComputeRangeFrameBound(child.newInstance().asInstanceOf[Alias], originalBound) + + override lazy val resolved: Boolean = originalBound.resolved + + override def dataType: DataType = child.dataType + + override def nullable: Boolean = child.nullable + + override def metadata: Metadata = child.metadata + + override def toAttribute: Attribute = child.toAttribute + + override def toString: String = child.toString + + override def hashCode(): Int = child.hashCode() + + override def equals(other: Any): Boolean = other match { + case a: PreComputeRangeFrameBound => + child.equals(a.child) + case _ => false + } + + override def sql: String = child.sql + + override protected def withNewChildInternal(newChild: Expression): PreComputeRangeFrameBound = + copy(child = newChild.asInstanceOf[Alias]) + +} diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarCollapseTransformStages.scala b/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarCollapseTransformStages.scala index 23746846e9cfb..e5925e3ac4d04 100644 --- a/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarCollapseTransformStages.scala +++ b/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarCollapseTransformStages.scala @@ -65,7 +65,7 @@ case class InputIteratorTransformer(child: SparkPlan) extends UnaryTransformSupp child.doExecuteBroadcast() } - override def doTransform(context: SubstraitContext): TransformContext = { + override protected def doTransform(context: SubstraitContext): TransformContext = { val operatorId = context.nextOperatorId(nodeName) val readRel = RelBuilder.makeReadRelForInputIterator(child.output.asJava, context, operatorId) TransformContext(output, output, readRel) @@ -160,7 +160,7 @@ case class ColumnarCollapseTransformStages( } case class ColumnarInputAdapter(child: SparkPlan) - extends UnaryExecNode + extends InputAdapterGenerateTreeStringShim with Convention.KnownBatchType { override def output: Seq[Attribute] = child.output override def supportsColumnar: Boolean = true diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarSubqueryBroadcastExec.scala b/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarSubqueryBroadcastExec.scala index a74d428fd4525..2c1edd04bb4a0 100644 --- a/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarSubqueryBroadcastExec.scala +++ b/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarSubqueryBroadcastExec.scala @@ -24,8 +24,6 @@ import org.apache.spark.rdd.RDD import 
org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.QueryPlan -import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec -import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.execution.joins.{BuildSideRelation, HashedRelation, HashJoin, LongHashedRelation} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.util.ThreadUtils @@ -75,35 +73,25 @@ case class ColumnarSubqueryBroadcastExec( SQLExecution.withExecutionId(session, executionId) { val rows = GlutenTimeMetric.millis(longMetric("collectTime")) { _ => - val exchangeChild = child match { - case exec: ReusedExchangeExec => - exec.child - case _ => - child - } - if ( - exchangeChild.isInstanceOf[ColumnarBroadcastExchangeExec] || - exchangeChild.isInstanceOf[AdaptiveSparkPlanExec] - ) { - // transform broadcasted columnar value to Array[InternalRow] by key - exchangeChild - .executeBroadcast[BuildSideRelation] - .value - .transform(buildKeys(index)) - .distinct - } else { - val broadcastRelation = exchangeChild.executeBroadcast[HashedRelation]().value - val (iter, expr) = if (broadcastRelation.isInstanceOf[LongHashedRelation]) { - (broadcastRelation.keys(), HashJoin.extractKeyExprAt(buildKeys, index)) - } else { - ( - broadcastRelation.keys(), - BoundReference(index, buildKeys(index).dataType, buildKeys(index).nullable)) - } - - val proj = UnsafeProjection.create(expr) - val keyIter = iter.map(proj).map(_.copy()) - keyIter.toArray[InternalRow].distinct + val relation = child.executeBroadcast[Any]().value + relation match { + case b: BuildSideRelation => + // Transform columnar broadcast value to Array[InternalRow] by key. + b.transform(buildKeys(index)).distinct + case h: HashedRelation => + val (iter, expr) = if (h.isInstanceOf[LongHashedRelation]) { + (h.keys(), HashJoin.extractKeyExprAt(buildKeys, index)) + } else { + ( + h.keys(), + BoundReference(index, buildKeys(index).dataType, buildKeys(index).nullable)) + } + val proj = UnsafeProjection.create(expr) + val keyIter = iter.map(proj).map(_.copy()) + keyIter.toArray[InternalRow].distinct + case other => + throw new UnsupportedOperationException( + s"Unrecognizable broadcast relation: $other") } } val dataSize = rows.map(_.asInstanceOf[UnsafeRow].getSizeInBytes).sum diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/execution/GlutenFallbackReporter.scala b/gluten-core/src/main/scala/org/apache/spark/sql/execution/GlutenFallbackReporter.scala index 00b0248aee77c..d41dce882602b 100644 --- a/gluten-core/src/main/scala/org/apache/spark/sql/execution/GlutenFallbackReporter.scala +++ b/gluten-core/src/main/scala/org/apache/spark/sql/execution/GlutenFallbackReporter.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution import org.apache.gluten.GlutenConfig import org.apache.gluten.events.GlutenPlanFallbackEvent import org.apache.gluten.extension.GlutenPlan -import org.apache.gluten.extension.columnar.{TRANSFORM_UNSUPPORTED, TransformHints} +import org.apache.gluten.extension.columnar.{FallbackTags, TRANSFORM_UNSUPPORTED} import org.apache.gluten.utils.LogLevelUtil import org.apache.spark.sql.SparkSession @@ -57,8 +57,8 @@ case class GlutenFallbackReporter(glutenConfig: GlutenConfig, spark: SparkSessio val validationLogLevel = glutenConfig.validationLogLevel plan.foreachUp { case _: GlutenPlan => // ignore - case p: SparkPlan if TransformHints.isNotTransformable(p) => - TransformHints.getHint(p) match { 
+ case p: SparkPlan if FallbackTags.nonEmpty(p) => + FallbackTags.getTag(p) match { case TRANSFORM_UNSUPPORTED(Some(reason), append) => logFallbackReason(validationLogLevel, p.nodeName, reason) // With in next round stage in AQE, the physical plan would be a new instance that @@ -89,6 +89,7 @@ case class GlutenFallbackReporter(glutenConfig: GlutenConfig, spark: SparkSessio logicalPlan.setTagValue(FALLBACK_REASON_TAG, newReason) } case TRANSFORM_UNSUPPORTED(_, _) => + logFallbackReason(validationLogLevel, p.nodeName, "unknown reason") case _ => throw new IllegalStateException("Unreachable code") } diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/execution/GlutenImplicits.scala b/gluten-core/src/main/scala/org/apache/spark/sql/execution/GlutenImplicits.scala index b0dc3a958e137..eb42f0a884602 100644 --- a/gluten-core/src/main/scala/org/apache/spark/sql/execution/GlutenImplicits.scala +++ b/gluten-core/src/main/scala/org/apache/spark/sql/execution/GlutenImplicits.scala @@ -205,8 +205,8 @@ object GlutenImplicits { FallbackSummary( totalNumGlutenNodes, totalNumFallbackNodes, - totalPhysicalPlanDescription, - totalFallbackNodeToReason + totalPhysicalPlanDescription.toSeq, + totalFallbackNodeToReason.toSeq ) } diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/execution/ShuffledColumnarBatchRDD.scala b/gluten-core/src/main/scala/org/apache/spark/sql/execution/ShuffledColumnarBatchRDD.scala index 42db17b6cd6bb..0642c3a24760f 100644 --- a/gluten-core/src/main/scala/org/apache/spark/sql/execution/ShuffledColumnarBatchRDD.scala +++ b/gluten-core/src/main/scala/org/apache/spark/sql/execution/ShuffledColumnarBatchRDD.scala @@ -139,7 +139,7 @@ class ShuffledColumnarBatchRDD( } } - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() dependency = null } diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/execution/datasources/GlutenFormatWriterInjectsBase.scala b/gluten-core/src/main/scala/org/apache/spark/sql/execution/datasources/GlutenFormatWriterInjectsBase.scala index fbdbeadba8861..450b88163afca 100644 --- a/gluten-core/src/main/scala/org/apache/spark/sql/execution/datasources/GlutenFormatWriterInjectsBase.scala +++ b/gluten-core/src/main/scala/org/apache/spark/sql/execution/datasources/GlutenFormatWriterInjectsBase.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution.datasources import org.apache.gluten.execution.{ProjectExecTransformer, SortExecTransformer, TransformSupport, WholeStageTransformer} import org.apache.gluten.execution.datasource.GlutenFormatWriterInjects -import org.apache.gluten.extension.columnar.AddTransformHintRule +import org.apache.gluten.extension.columnar.AddFallbackTagRule import org.apache.gluten.extension.columnar.MiscColumnarRules.TransformPreOverrides import org.apache.gluten.extension.columnar.rewrite.RewriteSparkPlanRulesManager @@ -47,7 +47,7 @@ trait GlutenFormatWriterInjectsBase extends GlutenFormatWriterInjects { val rules = List( RewriteSparkPlanRulesManager(), - AddTransformHintRule(), + AddFallbackTagRule(), TransformPreOverrides() ) val transformed = rules.foldLeft(plan) { case (latestPlan, rule) => rule.apply(latestPlan) } diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/execution/datasources/GlutenWriterColumnarRules.scala b/gluten-core/src/main/scala/org/apache/spark/sql/execution/datasources/GlutenWriterColumnarRules.scala index 859cca842df3f..7063c3f67b800 100644 --- 
a/gluten-core/src/main/scala/org/apache/spark/sql/execution/datasources/GlutenWriterColumnarRules.scala +++ b/gluten-core/src/main/scala/org/apache/spark/sql/execution/datasources/GlutenWriterColumnarRules.scala @@ -24,7 +24,7 @@ import org.apache.gluten.extension.columnar.transition.Transitions import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OrderPreservingUnaryNode} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ @@ -65,6 +65,11 @@ case class FakeRowAdaptor(child: SparkPlan) doExecuteColumnar().map(cb => new FakeRow(cb)) } + override def outputOrdering: Seq[SortOrder] = child match { + case aqe: AdaptiveSparkPlanExec => aqe.executedPlan.outputOrdering + case _ => child.outputOrdering + } + override protected def doExecuteColumnar(): RDD[ColumnarBatch] = { if (child.supportsColumnar) { child.executeColumnar() @@ -157,19 +162,28 @@ object GlutenWriterColumnarRules { if write.getClass.getName == NOOP_WRITE && BackendsApiManager.getSettings.enableNativeWriteFiles() => injectFakeRowAdaptor(rc, rc.child) - case rc @ DataWritingCommandExec(cmd, child) - if BackendsApiManager.getSettings.supportNativeWrite(child.output.toStructType.fields) => - val format = getNativeFormat(cmd) - session.sparkContext.setLocalProperty( - "staticPartitionWriteOnly", - BackendsApiManager.getSettings.staticPartitionWriteOnly().toString) - // FIXME: We should only use context property if having no other approaches. - // Should see if there is another way to pass these options. - session.sparkContext.setLocalProperty("isNativeAppliable", format.isDefined.toString) - session.sparkContext.setLocalProperty("nativeFormat", format.getOrElse("")) - if (format.isDefined) { - injectFakeRowAdaptor(rc, child) + case rc @ DataWritingCommandExec(cmd, child) => + if (BackendsApiManager.getSettings.supportNativeWrite(child.output.toStructType.fields)) { + val format = getNativeFormat(cmd) + session.sparkContext.setLocalProperty( + "staticPartitionWriteOnly", + BackendsApiManager.getSettings.staticPartitionWriteOnly().toString) + // FIXME: We should only use context property if having no other approaches. + // Should see if there is another way to pass these options. 
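The hunk above keeps handing the native-write options (`staticPartitionWriteOnly`, `isNativeAppliable`, `nativeFormat`) to the write path through SparkContext local properties. Local properties set on the driver are propagated to the tasks of subsequently submitted jobs and can be read back through `TaskContext`; a hedged sketch of the reading side, with a helper name that is illustrative rather than part of this patch:

```scala
import org.apache.spark.TaskContext

// Reads the flag set via session.sparkContext.setLocalProperty("isNativeAppliable", ...).
def isNativeWriteEnabled: Boolean = {
  val ctx = TaskContext.get()
  ctx != null && "true".equalsIgnoreCase(ctx.getLocalProperty("isNativeAppliable"))
}
```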
+ session.sparkContext.setLocalProperty("isNativeAppliable", format.isDefined.toString) + session.sparkContext.setLocalProperty("nativeFormat", format.getOrElse("")) + if (format.isDefined) { + injectFakeRowAdaptor(rc, child) + } else { + rc.withNewChildren(rc.children.map(apply)) + } } else { + session.sparkContext.setLocalProperty( + "staticPartitionWriteOnly", + BackendsApiManager.getSettings.staticPartitionWriteOnly().toString) + session.sparkContext.setLocalProperty("isNativeAppliable", "false") + session.sparkContext.setLocalProperty("nativeFormat", "") + rc.withNewChildren(rc.children.map(apply)) } case plan: SparkPlan => plan.withNewChildren(plan.children.map(apply)) diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExecTransformer.scala b/gluten-core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExecTransformer.scala index 43dd2f4538030..ecedc1bae01c8 100644 --- a/gluten-core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExecTransformer.scala @@ -84,8 +84,8 @@ case class EvalPythonExecTransformer( doNativeValidation(context, relNode) } - override def doTransform(context: SubstraitContext): TransformContext = { - val childCtx = child.asInstanceOf[TransformSupport].doTransform(context) + override protected def doTransform(context: SubstraitContext): TransformContext = { + val childCtx = child.asInstanceOf[TransformSupport].transform(context) val args = context.registeredFunction val operatorId = context.nextOperatorId(this.nodeName) val expressionNodes = new JArrayList[ExpressionNode] diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/hive/HivePartitionConverter.scala b/gluten-core/src/main/scala/org/apache/spark/sql/hive/HivePartitionConverter.scala index d76eca3d3c49e..3a65d6f559ea1 100644 --- a/gluten-core/src/main/scala/org/apache/spark/sql/hive/HivePartitionConverter.scala +++ b/gluten-core/src/main/scala/org/apache/spark/sql/hive/HivePartitionConverter.scala @@ -80,9 +80,10 @@ class HivePartitionConverter(hadoopConf: Configuration, session: SparkSession) // just like for Apache Spark. 
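The explicit `.toSeq` conversions in this patch (in `GlutenImplicits` above and in the `HivePartitionConverter` hunk just below) follow the usual pattern for cross-building against Scala 2.13, where `scala.Seq` aliases `immutable.Seq` and the mutable `Buffer` returned by `asScala` no longer conforms to it. A minimal illustration with hypothetical values:

```scala
import java.util.Arrays

import scala.collection.JavaConverters._

val javaValues: java.util.List[String] = Arrays.asList("2024", "05")
// Compiles on both 2.12 and 2.13; without .toSeq the assignment is rejected on 2.13
// because asScala yields a mutable Buffer rather than an immutable Seq.
val partValues: Seq[String] = javaValues.asScala.toSeq
```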
val uri = p.getDataLocation.toUri val partValues: Seq[Any] = { - p.getValues.asScala.zip(partitionColTypes).map { - case (value, dataType) => castFromString(value, dataType) - } + p.getValues.asScala + .zip(partitionColTypes) + .map { case (value, dataType) => castFromString(value, dataType) } + .toSeq } val partValuesAsInternalRow = InternalRow.fromSeq(partValues) diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/hive/HiveTableScanExecTransformer.scala b/gluten-core/src/main/scala/org/apache/spark/sql/hive/HiveTableScanExecTransformer.scala index 2952267e5a1de..95793e5dc9354 100644 --- a/gluten-core/src/main/scala/org/apache/spark/sql/hive/HiveTableScanExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/spark/sql/hive/HiveTableScanExecTransformer.scala @@ -22,7 +22,6 @@ import org.apache.gluten.extension.ValidationResult import org.apache.gluten.metrics.MetricsUpdater import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat -import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.catalog.HiveTableRelation import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSeq, Expression} @@ -34,7 +33,6 @@ import org.apache.spark.sql.hive.HiveTableScanExecTransformer._ import org.apache.spark.sql.hive.client.HiveClientImpl import org.apache.spark.sql.hive.execution.{AbstractHiveTableScanExec, HiveTableScanExec} import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.Utils import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat @@ -81,10 +79,6 @@ case class HiveTableScanExecTransformer( override def metricsUpdater(): MetricsUpdater = BackendsApiManager.getMetricsApiInstance.genHiveTableScanTransformerMetricsUpdater(metrics) - override def doExecuteColumnar(): RDD[ColumnarBatch] = { - doExecuteColumnarInternal() - } - @transient private lazy val hivePartitionConverter = new HivePartitionConverter(session.sessionState.newHadoopConf(), session) @@ -194,10 +188,6 @@ object HiveTableScanExecTransformer { plan.isInstanceOf[HiveTableScanExec] } - def getPartitionFilters(plan: SparkPlan): Seq[Expression] = { - plan.asInstanceOf[HiveTableScanExec].partitionPruningPred - } - def copyWith(plan: SparkPlan, newPartitionFilters: Seq[Expression]): SparkPlan = { val hiveTableScanExec = plan.asInstanceOf[HiveTableScanExec] hiveTableScanExec.copy(partitionPruningPred = newPartitionFilters)(sparkSession = diff --git a/gluten-core/src/test/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumerTest.java b/gluten-core/src/test/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumerTest.java index e26765d33082d..db018ffe40431 100644 --- a/gluten-core/src/test/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumerTest.java +++ b/gluten-core/src/test/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumerTest.java @@ -17,6 +17,7 @@ package org.apache.gluten.memory.memtarget.spark; import org.apache.gluten.GlutenConfig; +import org.apache.gluten.memory.memtarget.Spillers; import org.apache.gluten.memory.memtarget.TreeMemoryTarget; import org.apache.spark.TaskContext; @@ -43,7 +44,7 @@ public void testIsolated() { factory.newConsumer( TaskContext.get().taskMemoryManager(), "FOO", - Collections.emptyList(), + Spillers.NOOP, Collections.emptyMap()); Assert.assertEquals(20, consumer.borrow(20)); Assert.assertEquals(70, consumer.borrow(70)); @@ -65,7 +66,7 @@ public void testShared() { 
factory.newConsumer( TaskContext.get().taskMemoryManager(), "FOO", - Collections.emptyList(), + Spillers.NOOP, Collections.emptyMap()); Assert.assertEquals(20, consumer.borrow(20)); Assert.assertEquals(70, consumer.borrow(70)); @@ -87,7 +88,7 @@ public void testIsolatedAndShared() { .newConsumer( TaskContext.get().taskMemoryManager(), "FOO", - Collections.emptyList(), + Spillers.NOOP, Collections.emptyMap()); Assert.assertEquals(110, shared.borrow(110)); final TreeMemoryTarget isolated = @@ -95,7 +96,7 @@ public void testIsolatedAndShared() { .newConsumer( TaskContext.get().taskMemoryManager(), "FOO", - Collections.emptyList(), + Spillers.NOOP, Collections.emptyMap()); Assert.assertEquals(100, isolated.borrow(110)); }); diff --git a/gluten-core/src/test/scala/org/apache/gluten/execution/WholeStageTransformerSuite.scala b/gluten-core/src/test/scala/org/apache/gluten/execution/WholeStageTransformerSuite.scala index 7d2d48828fb32..95391a2c42f56 100644 --- a/gluten-core/src/test/scala/org/apache/gluten/execution/WholeStageTransformerSuite.scala +++ b/gluten-core/src/test/scala/org/apache/gluten/execution/WholeStageTransformerSuite.scala @@ -319,6 +319,26 @@ abstract class WholeStageTransformerSuite df } + /** + * Some rule on LogicalPlan will not only apply in select query, the total df.load() should in + * spark environment with gluten disabled config. + * + * @param sql + * @param f + * @return + */ + protected def runAndCompare(sql: String)(f: => Unit): DataFrame = { + var expected: Seq[Row] = null + withSQLConf(vanillaSparkConfs(): _*) { + f + expected = spark.sql(sql).collect() + } + f + val df = spark.sql(sql) + checkAnswer(df, expected) + df + } + protected def runQueryAndCompare( sqlStr: String, compareResult: Boolean = true, diff --git a/gluten-core/src/test/scala/org/apache/gluten/utils/IteratorSuite.scala b/gluten-core/src/test/scala/org/apache/gluten/utils/iterator/IteratorSuite.scala similarity index 86% rename from gluten-core/src/test/scala/org/apache/gluten/utils/IteratorSuite.scala rename to gluten-core/src/test/scala/org/apache/gluten/utils/iterator/IteratorSuite.scala index 389e2adfefd49..1a84d671922d9 100644 --- a/gluten-core/src/test/scala/org/apache/gluten/utils/IteratorSuite.scala +++ b/gluten-core/src/test/scala/org/apache/gluten/utils/iterator/IteratorSuite.scala @@ -14,18 +14,25 @@ * See the License for the specific language governing permissions and * limitations under the License. 
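A hedged usage sketch of the `runAndCompare` helper added above: the setup block is executed twice, first under the vanilla-Spark configuration to compute the expected rows and then under the suite's default (Gluten) configuration, after which the two answers are compared. The table and helper names below are illustrative only:

```scala
// Inside a suite extending WholeStageTransformerSuite:
runAndCompare("SELECT l_orderkey, count(*) FROM lineitem GROUP BY l_orderkey") {
  // Re-runnable setup executed in both environments, e.g. (re)registering the tables.
  createTPCHNotNullTables() // hypothetical helper; substitute the suite's own table setup
}
```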
*/ -package org.apache.gluten.utils +package org.apache.gluten.utils.iterator + +import org.apache.gluten.utils.iterator.Iterators.{V1, WrapperBuilder} import org.apache.spark.util.TaskResources import org.scalatest.funsuite.AnyFunSuite -class IteratorSuite extends AnyFunSuite { +class IteratorV1Suite extends IteratorSuite { + override protected def wrap[A](in: Iterator[A]): WrapperBuilder[A] = Iterators.wrap(V1, in) +} + +abstract class IteratorSuite extends AnyFunSuite { + protected def wrap[A](in: Iterator[A]): WrapperBuilder[A] + test("Trivial wrapping") { val strings = Array[String]("one", "two", "three") val itr = strings.toIterator - val wrapped = Iterators - .wrap(itr) + val wrapped = wrap(itr) .create() assertResult(strings) { wrapped.toArray @@ -37,8 +44,7 @@ class IteratorSuite extends AnyFunSuite { TaskResources.runUnsafe { val strings = Array[String]("one", "two", "three") val itr = strings.toIterator - val wrapped = Iterators - .wrap(itr) + val wrapped = wrap(itr) .recycleIterator { completeCount += 1 } @@ -56,8 +62,7 @@ class IteratorSuite extends AnyFunSuite { TaskResources.runUnsafe { val strings = Array[String]("one", "two", "three") val itr = strings.toIterator - val _ = Iterators - .wrap(itr) + val _ = wrap(itr) .recycleIterator { completeCount += 1 } @@ -72,8 +77,7 @@ class IteratorSuite extends AnyFunSuite { TaskResources.runUnsafe { val strings = Array[String]("one", "two", "three") val itr = strings.toIterator - val wrapped = Iterators - .wrap(itr) + val wrapped = wrap(itr) .recyclePayload { _: String => closeCount += 1 } .create() assertResult(strings) { @@ -89,8 +93,7 @@ class IteratorSuite extends AnyFunSuite { TaskResources.runUnsafe { val strings = Array[String]("one", "two", "three") val itr = strings.toIterator - val wrapped = Iterators - .wrap(itr) + val wrapped = wrap(itr) .recyclePayload { _: String => closeCount += 1 } .create() assertResult(strings.take(2)) { @@ -115,8 +118,7 @@ class IteratorSuite extends AnyFunSuite { new Object } } - val wrapped = Iterators - .wrap(itr) + val wrapped = wrap(itr) .protectInvocationFlow() .create() wrapped.hasNext diff --git a/gluten-core/src/test/scala/org/apache/spark/sql/GlutenQueryTest.scala b/gluten-core/src/test/scala/org/apache/spark/sql/GlutenQueryTest.scala index 35afc731bc2ef..ab30cb14e1805 100644 --- a/gluten-core/src/test/scala/org/apache/spark/sql/GlutenQueryTest.scala +++ b/gluten-core/src/test/scala/org/apache/spark/sql/GlutenQueryTest.scala @@ -20,10 +20,10 @@ package org.apache.spark.sql * Why we need a GlutenQueryTest when we already have QueryTest? * 1. 
We need to modify the way org.apache.spark.sql.CHQueryTest#compare compares double */ +import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.spark.SPARK_VERSION_SHORT -import org.apache.spark.rpc.GlutenDriverEndpoint import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ @@ -339,7 +339,7 @@ object GlutenQueryTest extends Assertions { SQLExecution.withExecutionId(df.sparkSession, executionId) { df.rdd.count() // Also attempt to deserialize as an RDD [SPARK-15791] } - GlutenDriverEndpoint.invalidateResourceRelation(executionId) + BackendsApiManager.getTransformerApiInstance.invalidateSQLExecutionResource(executionId) } val sparkAnswer = diff --git a/gluten-core/src/test/scala/org/apache/spark/utils/iterator/IteratorBenchmark.scala b/gluten-core/src/test/scala/org/apache/spark/utils/iterator/IteratorBenchmark.scala new file mode 100644 index 0000000000000..aa69f309aac86 --- /dev/null +++ b/gluten-core/src/test/scala/org/apache/spark/utils/iterator/IteratorBenchmark.scala @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.utils.iterator + +import org.apache.gluten.utils.iterator.Iterators +import org.apache.gluten.utils.iterator.Iterators.V1 + +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.util.TaskResources + +object IteratorBenchmark extends BenchmarkBase { + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + runBenchmark("Iterator Nesting") { + TaskResources.runUnsafe { + val nPayloads: Int = 50000000 // 50 millions + + def makeScalaIterator: Iterator[Any] = { + (0 until nPayloads).view.map { _: Int => new Object }.iterator + } + + def compareIterator(name: String)( + makeGlutenIterator: Iterators.Version => Iterator[Any]): Unit = { + val benchmark = new Benchmark(name, nPayloads, output = output) + benchmark.addCase("Scala Iterator") { + _ => + val count = makeScalaIterator.count(_ => true) + assert(count == nPayloads) + } + benchmark.addCase("Gluten Iterator V1") { + _ => + val count = makeGlutenIterator(V1).count(_ => true) + assert(count == nPayloads) + } + benchmark.run() + } + + compareIterator("0 Levels Nesting") { + version => + Iterators + .wrap(version, makeScalaIterator) + .create() + } + compareIterator("1 Levels Nesting - read") { + version => + Iterators + .wrap(version, makeScalaIterator) + .collectReadMillis { _ => } + .create() + } + compareIterator("5 Levels Nesting - read") { + version => + Iterators + .wrap(version, makeScalaIterator) + .collectReadMillis { _ => } + .collectReadMillis { _ => } + .collectReadMillis { _ => } + .collectReadMillis { _ => } + .collectReadMillis { _ => } + .create() + } + compareIterator("10 Levels Nesting - read") { + version => + Iterators + .wrap(version, makeScalaIterator) + .collectReadMillis { _ => } + .collectReadMillis { _ => } + .collectReadMillis { _ => } + .collectReadMillis { _ => } + .collectReadMillis { _ => } + .collectReadMillis { _ => } + .collectReadMillis { _ => } + .collectReadMillis { _ => } + .collectReadMillis { _ => } + .collectReadMillis { _ => } + .create() + } + compareIterator("1 Levels Nesting - recycle") { + version => + Iterators + .wrap(version, makeScalaIterator) + .recycleIterator {} + .create() + } + compareIterator("5 Levels Nesting - recycle") { + version => + Iterators + .wrap(version, makeScalaIterator) + .recycleIterator {} + .recycleIterator {} + .recycleIterator {} + .recycleIterator {} + .recycleIterator {} + .create() + } + compareIterator("10 Levels Nesting - recycle") { + version => + Iterators + .wrap(version, makeScalaIterator) + .recycleIterator {} + .recycleIterator {} + .recycleIterator {} + .recycleIterator {} + .recycleIterator {} + .recycleIterator {} + .recycleIterator {} + .recycleIterator {} + .recycleIterator {} + .recycleIterator {} + .create() + } + } + } + } +} diff --git a/gluten-data/pom.xml b/gluten-data/pom.xml index bb84a06b4125d..c28490d77faad 100644 --- a/gluten-data/pom.xml +++ b/gluten-data/pom.xml @@ -124,7 +124,7 @@ org.apache.arrow arrow-c-data - ${arrow.version} + ${arrow-gluten.version} org.apache.arrow @@ -140,7 +140,7 @@ org.apache.arrow arrow-dataset - ${arrow.version} + ${arrow-gluten.version} io.netty diff --git a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatchJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatchJniWrapper.java index 4f042b893832f..e71e9d7bee1b5 100644 --- a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatchJniWrapper.java +++ 
b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatchJniWrapper.java @@ -18,7 +18,6 @@ import org.apache.gluten.exec.Runtime; import org.apache.gluten.exec.RuntimeAware; -import org.apache.gluten.exec.Runtimes; public class ColumnarBatchJniWrapper implements RuntimeAware { private final Runtime runtime; @@ -27,11 +26,7 @@ private ColumnarBatchJniWrapper(Runtime runtime) { this.runtime = runtime; } - public static ColumnarBatchJniWrapper create() { - return new ColumnarBatchJniWrapper(Runtimes.contextInstance()); - } - - public static ColumnarBatchJniWrapper forRuntime(Runtime runtime) { + public static ColumnarBatchJniWrapper create(Runtime runtime) { return new ColumnarBatchJniWrapper(runtime); } @@ -39,22 +34,19 @@ public static ColumnarBatchJniWrapper forRuntime(Runtime runtime) { public native long getForEmptySchema(int numRows); - public native String getType(long batchHandle); + public native String getType(long batch); - public native long numColumns(long batchHandle); + public native long numColumns(long batch); - public native long numRows(long batchHandle); + public native long numRows(long batch); - public native long numBytes(long batchHandle); + public native long numBytes(long batch); public native long compose(long[] batches); public native void exportToArrow(long batch, long cSchema, long cArray); - public native long select( - long nativeMemoryManagerHandle, // why a mm is needed here? - long batch, - int[] columnIndices); + public native long select(long batch, int[] columnIndices); public native void close(long batch); diff --git a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatches.java b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatches.java index e2cfa335d5c6f..cb68e032dc5ba 100644 --- a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatches.java +++ b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatches.java @@ -19,13 +19,12 @@ import org.apache.gluten.exception.GlutenException; import org.apache.gluten.exec.Runtime; import org.apache.gluten.exec.Runtimes; -import org.apache.gluten.memory.nmm.NativeMemoryManager; import org.apache.gluten.utils.ArrowAbiUtil; import org.apache.gluten.utils.ArrowUtil; import org.apache.gluten.utils.ImplicitClass; import org.apache.gluten.vectorized.ArrowWritableColumnVector; -import com.google.common.base.Preconditions; +import com.google.common.annotations.VisibleForTesting; import org.apache.arrow.c.ArrowArray; import org.apache.arrow.c.ArrowSchema; import org.apache.arrow.c.CDataDictionaryProvider; @@ -38,11 +37,8 @@ import java.lang.reflect.Field; import java.util.Arrays; -import java.util.HashSet; import java.util.Iterator; -import java.util.List; import java.util.NoSuchElementException; -import java.util.Set; public class ColumnarBatches { private static final Field FIELD_COLUMNS; @@ -99,10 +95,12 @@ private static void transferVectors(ColumnarBatch from, ColumnarBatch target) { if (target.numCols() != from.numCols()) { throw new IllegalStateException(); } - final ColumnVector[] vectors = (ColumnVector[]) FIELD_COLUMNS.get(target); + final ColumnVector[] newVectors = new ColumnVector[from.numCols()]; for (int i = 0; i < target.numCols(); i++) { - vectors[i] = from.column(i); + newVectors[i] = from.column(i); } + FIELD_COLUMNS.set(target, newVectors); + System.out.println(); } catch (IllegalAccessException e) { throw new GlutenException(e); } @@ -125,15 +123,14 @@ public static boolean isLightBatch(ColumnarBatch batch) { * This 
method will always return a velox based ColumnarBatch. This method will close the input * column batch. */ - public static ColumnarBatch select( - NativeMemoryManager nmm, ColumnarBatch batch, int[] columnIndices) { + public static ColumnarBatch select(ColumnarBatch batch, int[] columnIndices) { + final Runtime runtime = Runtimes.contextInstance("ColumnarBatches#select"); switch (identifyBatchType(batch)) { case LIGHT: final IndicatorVector iv = getIndicatorVector(batch); long outputBatchHandle = - ColumnarBatchJniWrapper.create() - .select(nmm.getNativeInstanceHandle(), iv.handle(), columnIndices); - return create(iv.runtime(), outputBatchHandle); + ColumnarBatchJniWrapper.create(runtime).select(iv.handle(), columnIndices); + return create(outputBatchHandle); case HEAVY: return new ColumnarBatch( Arrays.stream(columnIndices).mapToObj(batch::column).toArray(ColumnVector[]::new), @@ -181,7 +178,7 @@ private static ColumnarBatch load(BufferAllocator allocator, ColumnarBatch input ArrowArray cArray = ArrowArray.allocateNew(allocator); ArrowSchema arrowSchema = ArrowSchema.allocateNew(allocator); CDataDictionaryProvider provider = new CDataDictionaryProvider()) { - ColumnarBatchJniWrapper.forRuntime(iv.runtime()) + ColumnarBatchJniWrapper.create(Runtimes.contextInstance("ColumnarBatches#load")) .exportToArrow(iv.handle(), cSchema.memoryAddress(), cArray.memoryAddress()); Data.exportSchema( @@ -217,14 +214,14 @@ private static ColumnarBatch offload(BufferAllocator allocator, ColumnarBatch in if (input.numCols() == 0) { throw new IllegalArgumentException("batch with zero columns cannot be offloaded"); } - final Runtime runtime = Runtimes.contextInstance(); + final Runtime runtime = Runtimes.contextInstance("ColumnarBatches#offload"); try (ArrowArray cArray = ArrowArray.allocateNew(allocator); ArrowSchema cSchema = ArrowSchema.allocateNew(allocator)) { ArrowAbiUtil.exportFromSparkColumnarBatch(allocator, input, cSchema, cArray); long handle = - ColumnarBatchJniWrapper.forRuntime(runtime) + ColumnarBatchJniWrapper.create(runtime) .createWithArrowArray(cSchema.memoryAddress(), cArray.memoryAddress()); - ColumnarBatch output = ColumnarBatches.create(runtime, handle); + ColumnarBatch output = ColumnarBatches.create(handle); // Follow input's reference count. This might be optimized using // automatic clean-up or once the extensibility of ColumnarBatch is enriched @@ -300,7 +297,8 @@ private static long getRefCntHeavy(ColumnarBatch input) { return refCnt; } - private static long getRefCnt(ColumnarBatch input) { + @VisibleForTesting + static long getRefCnt(ColumnarBatch input) { switch (identifyBatchType(input)) { case LIGHT: return getRefCntLight(input); @@ -333,18 +331,12 @@ public static long compose(ColumnarBatch... batches) { Arrays.stream(batches) .map(ColumnarBatches::getIndicatorVector) .toArray(IndicatorVector[]::new); - // We assume all input batches should be managed by same Runtime. 
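A caller-side sketch of the reworked `select` above, which now resolves its `Runtime` internally via `Runtimes.contextInstance("ColumnarBatches#select")` instead of taking a `NativeMemoryManager`; the column indices are illustrative:

```scala
import org.apache.gluten.columnarbatch.ColumnarBatches

import org.apache.spark.sql.vectorized.ColumnarBatch

// Projects the first two columns of a Gluten-managed ("light") batch. Note that, per the
// contract documented above, select closes the input batch.
def projectFirstTwo(batch: ColumnarBatch): ColumnarBatch =
  ColumnarBatches.select(batch, Array(0, 1))
```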
- // FIXME: The check could be removed to adopt ownership-transfer semantic - final Runtime[] ctxs = - Arrays.stream(ivs).map(IndicatorVector::runtime).distinct().toArray(Runtime[]::new); - Preconditions.checkState( - ctxs.length == 1, "All input batches should be managed by same Runtime."); final long[] handles = Arrays.stream(ivs).mapToLong(IndicatorVector::handle).toArray(); - return ColumnarBatchJniWrapper.forRuntime(ctxs[0]).compose(handles); + return ColumnarBatchJniWrapper.create(Runtimes.contextInstance("ColumnarBatches#compose")) + .compose(handles); } - public static ColumnarBatch create(Runtime runtime, long nativeHandle) { - final IndicatorVector iv = new IndicatorVector(runtime, nativeHandle); + private static ColumnarBatch create(IndicatorVector iv) { int numColumns = Math.toIntExact(iv.getNumColumns()); int numRows = Math.toIntExact(iv.getNumRows()); if (numColumns == 0) { @@ -360,6 +352,10 @@ public static ColumnarBatch create(Runtime runtime, long nativeHandle) { return new ColumnarBatch(columnVectors, numRows); } + public static ColumnarBatch create(long nativeHandle) { + return create(IndicatorVector.obtain(nativeHandle)); + } + public static void retain(ColumnarBatch b) { switch (identifyBatchType(b)) { case LIGHT: @@ -384,18 +380,4 @@ public static void release(ColumnarBatch b) { public static long getNativeHandle(ColumnarBatch batch) { return getIndicatorVector(batch).handle(); } - - public static Runtime getRuntime(ColumnarBatch batch) { - return getIndicatorVector(batch).runtime(); - } - - public static Runtime getRuntime(List batch) { - final Set all = new HashSet<>(); - batch.forEach(b -> all.add(getRuntime(b))); - if (all.size() != 1) { - throw new IllegalArgumentException( - "The input columnar batches has different associated runtimes"); - } - return all.toArray(new Runtime[0])[0]; - } } diff --git a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVector.java b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVector.java index d15412bec7e56..7fe87e95fa548 100644 --- a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVector.java +++ b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVector.java @@ -16,148 +16,45 @@ */ package org.apache.gluten.columnarbatch; -import org.apache.gluten.exec.Runtime; - -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.sql.vectorized.ColumnVector; -import org.apache.spark.sql.vectorized.ColumnarArray; -import org.apache.spark.sql.vectorized.ColumnarMap; -import org.apache.spark.unsafe.types.UTF8String; +import org.apache.spark.util.TaskResources; import java.util.concurrent.atomic.AtomicLong; -public class IndicatorVector extends ColumnVector { - private final Runtime runtime; - private final long handle; +public class IndicatorVector extends IndicatorVectorBase { + private final IndicatorVectorPool pool; private final AtomicLong refCnt = new AtomicLong(1L); - protected IndicatorVector(Runtime runtime, long handle) { - super(DataTypes.NullType); - this.runtime = runtime; - this.handle = handle; - } - - public Runtime runtime() { - return runtime; - } - - public String getType() { - return ColumnarBatchJniWrapper.forRuntime(runtime).getType(handle); - } - - public long getNumColumns() { - return ColumnarBatchJniWrapper.forRuntime(runtime).numColumns(handle); + protected IndicatorVector(IndicatorVectorPool pool, long handle) { + super(handle); + this.pool = pool; } - public long getNumRows() { - 
return ColumnarBatchJniWrapper.forRuntime(runtime).numRows(handle); + static IndicatorVector obtain(long handle) { + final IndicatorVectorPool pool = + TaskResources.addResourceIfNotRegistered( + IndicatorVectorPool.class.getName(), IndicatorVectorPool::new); + return pool.obtain(handle); } - public long refCnt() { + @Override + long refCnt() { return refCnt.get(); } - public void retain() { + @Override + void retain() { refCnt.getAndIncrement(); } @Override - public void close() { + void release() { if (refCnt.get() == 0) { // TODO use stronger restriction (IllegalStateException probably) return; } if (refCnt.decrementAndGet() == 0) { - ColumnarBatchJniWrapper.forRuntime(runtime).close(handle); + pool.remove(handle); + jniWrapper.close(handle); } } - - public boolean isClosed() { - return refCnt.get() == 0; - } - - @Override - public boolean hasNull() { - throw new UnsupportedOperationException(); - } - - @Override - public int numNulls() { - throw new UnsupportedOperationException(); - } - - @Override - public boolean isNullAt(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean getBoolean(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public byte getByte(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public short getShort(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public int getInt(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public long getLong(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public float getFloat(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public double getDouble(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public ColumnarArray getArray(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public ColumnarMap getMap(int ordinal) { - throw new UnsupportedOperationException(); - } - - @Override - public Decimal getDecimal(int rowId, int precision, int scale) { - throw new UnsupportedOperationException(); - } - - @Override - public UTF8String getUTF8String(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public byte[] getBinary(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public ColumnVector getChild(int ordinal) { - throw new UnsupportedOperationException(); - } - - public long handle() { - return handle; - } } diff --git a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVectorBase.java b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVectorBase.java new file mode 100644 index 0000000000000..1bc685bd5cebe --- /dev/null +++ b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVectorBase.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
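`IndicatorVector.obtain` above routes through a task-local `IndicatorVectorPool` (registered via `TaskResources.addResourceIfNotRegistered`) so that each native batch handle maps to at most one `IndicatorVector` and the native close therefore runs exactly once. A generic, hedged sketch of that pooling idea, not the actual Gluten class:

```scala
import java.util.concurrent.ConcurrentHashMap

// At most one wrapper object per native handle; removal is routed through the pool so the
// underlying native resource cannot be freed twice.
final class HandlePool[T](create: Long => T) {
  private val instances = new ConcurrentHashMap[java.lang.Long, T]()

  def obtain(handle: Long): T =
    instances.computeIfAbsent(handle, (h: java.lang.Long) => create(h.longValue()))

  def remove(handle: Long): Unit = {
    if (instances.remove(handle) == null) {
      throw new IllegalStateException(s"Handle $handle not found in pool")
    }
  }
}
```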
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.columnarbatch; + +import org.apache.gluten.exec.Runtimes; + +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Decimal; +import org.apache.spark.sql.vectorized.ColumnVector; +import org.apache.spark.sql.vectorized.ColumnarArray; +import org.apache.spark.sql.vectorized.ColumnarMap; +import org.apache.spark.unsafe.types.UTF8String; + +public abstract class IndicatorVectorBase extends ColumnVector { + protected final ColumnarBatchJniWrapper jniWrapper; + protected final long handle; + + protected IndicatorVectorBase(long handle) { + super(DataTypes.NullType); + this.jniWrapper = + ColumnarBatchJniWrapper.create(Runtimes.contextInstance("IndicatorVectorBase#init")); + this.handle = handle; + } + + public String getType() { + return jniWrapper.getType(handle); + } + + public long getNumColumns() { + return jniWrapper.numColumns(handle); + } + + public long getNumRows() { + return jniWrapper.numRows(handle); + } + + abstract long refCnt(); + + abstract void retain(); + + abstract void release(); + + @Override + public void close() { + release(); + } + + @Override + public boolean hasNull() { + throw new UnsupportedOperationException(); + } + + @Override + public int numNulls() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean isNullAt(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean getBoolean(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public byte getByte(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public short getShort(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public int getInt(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public long getLong(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public float getFloat(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public double getDouble(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public ColumnarArray getArray(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public ColumnarMap getMap(int ordinal) { + throw new UnsupportedOperationException(); + } + + @Override + public Decimal getDecimal(int rowId, int precision, int scale) { + throw new UnsupportedOperationException(); + } + + @Override + public UTF8String getUTF8String(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public byte[] getBinary(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public ColumnVector getChild(int ordinal) { + throw new UnsupportedOperationException(); + } + + public long handle() { + return handle; + } +} diff --git a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVectorPool.java b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVectorPool.java new file mode 100644 index 0000000000000..6e46742b564ad --- /dev/null +++ 
b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVectorPool.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.columnarbatch; + +import org.apache.spark.util.TaskResource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +public class IndicatorVectorPool implements TaskResource { + private static final Logger LOG = LoggerFactory.getLogger(IndicatorVectorPool.class); + // A pool for all alive indicator vectors. The reason we adopt the pool + // is, we don't want one native columnar batch (which is located via the + // long int handle through JNI bridge) to be owned by more than one IndicatorVector + // instance so release method of the native columnar batch could be guaranteed + // to be called and only called once. + private final Map uniqueInstances = new ConcurrentHashMap<>(); + + IndicatorVectorPool() {} + + @Override + public void release() throws Exception { + if (!uniqueInstances.isEmpty()) { + LOG.warn( + "There are still unreleased native columnar batches during ending the task." 
+ + " Will close them automatically however the batches should be better released" + + " manually to minimize memory pressure."); + } + } + + IndicatorVector obtain(long handle) { + return uniqueInstances.computeIfAbsent(handle, h -> new IndicatorVector(this, handle)); + } + + void remove(long handle) { + if (uniqueInstances.remove(handle) == null) { + throw new IllegalStateException("Indicator vector not found in pool, this should not happen"); + } + } + + @Override + public int priority() { + return 0; + } + + @Override + public String resourceName() { + return IndicatorVectorPool.class.getName(); + } +} diff --git a/gluten-data/src/main/java/org/apache/gluten/datasource/DatasourceJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/datasource/DatasourceJniWrapper.java index 46d12c1deee9c..5a34196c4eb76 100644 --- a/gluten-data/src/main/java/org/apache/gluten/datasource/DatasourceJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/datasource/DatasourceJniWrapper.java @@ -18,7 +18,6 @@ import org.apache.gluten.exec.Runtime; import org.apache.gluten.exec.RuntimeAware; -import org.apache.gluten.exec.Runtimes; import org.apache.gluten.init.JniUtils; import org.apache.gluten.vectorized.ColumnarBatchInIterator; @@ -35,8 +34,8 @@ private DatasourceJniWrapper(Runtime runtime) { this.runtime = runtime; } - public static DatasourceJniWrapper create() { - return new DatasourceJniWrapper(Runtimes.contextInstance()); + public static DatasourceJniWrapper create(Runtime runtime) { + return new DatasourceJniWrapper(runtime); } @Override @@ -44,14 +43,11 @@ public long handle() { return runtime.getHandle(); } - public long nativeInitDatasource( - String filePath, long cSchema, long memoryManagerHandle, Map options) { - return nativeInitDatasource( - filePath, cSchema, memoryManagerHandle, JniUtils.toNativeConf(options)); + public long nativeInitDatasource(String filePath, long cSchema, Map options) { + return nativeInitDatasource(filePath, cSchema, JniUtils.toNativeConf(options)); } - public native long nativeInitDatasource( - String filePath, long cSchema, long memoryManagerHandle, byte[] options); + public native long nativeInitDatasource(String filePath, long cSchema, byte[] options); public native void inspectSchema(long dsHandle, long cSchemaAddress); @@ -60,5 +56,5 @@ public native long nativeInitDatasource( public native void write(long dsHandle, ColumnarBatchInIterator iterator); public native BlockStripes splitBlockByPartitionAndBucket( - long blockAddress, int[] partitionColIndice, boolean hasBucket, long memoryManagerId); + long blockAddress, int[] partitionColIndice, boolean hasBucket); } diff --git a/gluten-data/src/main/java/org/apache/gluten/exec/RuntimeJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/exec/RuntimeJniWrapper.java index 14ec87dcf6ca8..d2a18e9b4930c 100644 --- a/gluten-data/src/main/java/org/apache/gluten/exec/RuntimeJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/exec/RuntimeJniWrapper.java @@ -16,11 +16,21 @@ */ package org.apache.gluten.exec; +import org.apache.gluten.memory.listener.ReservationListener; + public class RuntimeJniWrapper { private RuntimeJniWrapper() {} - public static native long createRuntime(String backendType, byte[] sessionConf); + public static native long createRuntime( + String backendType, ReservationListener listener, byte[] sessionConf); + + // Memory management. 
+ public static native byte[] collectMemoryUsage(long handle); + + public static native long shrinkMemory(long handle, long size); + + public static native void holdMemory(long handle); public static native void releaseRuntime(long handle); } diff --git a/gluten-data/src/main/java/org/apache/gluten/memory/alloc/NativeMemoryAllocators.java b/gluten-data/src/main/java/org/apache/gluten/memory/alloc/NativeMemoryAllocators.java deleted file mode 100644 index e60766ac1f954..0000000000000 --- a/gluten-data/src/main/java/org/apache/gluten/memory/alloc/NativeMemoryAllocators.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.gluten.memory.alloc; - -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; - -/** - * Built-in toolkit for managing native memory allocations. To use the facility, one should import - * Gluten's C++ library then create the c++ instance using following example code: - * - *

```c++ auto* allocator = reinterpret_cast(allocator_id); ``` - * - *
The ID "allocator_id" can be retrieved from Java API {@link - * NativeMemoryAllocator#getNativeInstanceId()}. - * - *
FIXME: to export the native APIs in a standard way - */ -public final class NativeMemoryAllocators { - private static final Map INSTANCES = - new ConcurrentHashMap<>(); - - private final NativeMemoryAllocator allocator; - - private NativeMemoryAllocators(NativeMemoryAllocator.Type type) { - allocator = NativeMemoryAllocator.create(type); - } - - public static NativeMemoryAllocators getDefault() { - return forType(NativeMemoryAllocator.Type.DEFAULT); - } - - private static NativeMemoryAllocators forType(NativeMemoryAllocator.Type type) { - return INSTANCES.computeIfAbsent(type, NativeMemoryAllocators::new); - } - - public NativeMemoryAllocator get() { - return allocator; - } -} diff --git a/gluten-data/src/main/java/org/apache/gluten/memory/arrow/alloc/ArrowBufferAllocators.java b/gluten-data/src/main/java/org/apache/gluten/memory/arrow/alloc/ArrowBufferAllocators.java index efee20e48b83d..f51852ab8d2bf 100644 --- a/gluten-data/src/main/java/org/apache/gluten/memory/arrow/alloc/ArrowBufferAllocators.java +++ b/gluten-data/src/main/java/org/apache/gluten/memory/arrow/alloc/ArrowBufferAllocators.java @@ -17,6 +17,7 @@ package org.apache.gluten.memory.arrow.alloc; import org.apache.gluten.memory.memtarget.MemoryTargets; +import org.apache.gluten.memory.memtarget.Spillers; import org.apache.arrow.memory.AllocationListener; import org.apache.arrow.memory.BufferAllocator; @@ -60,11 +61,9 @@ public static class ArrowBufferAllocatorManager implements TaskResource { listener = new ManagedAllocationListener( MemoryTargets.throwOnOom( - MemoryTargets.newConsumer( - tmm, - "ArrowContextInstance", - Collections.emptyList(), - Collections.emptyMap())), + MemoryTargets.dynamicOffHeapSizingIfEnabled( + MemoryTargets.newConsumer( + tmm, "ArrowContextInstance", Spillers.NOOP, Collections.emptyMap()))), TaskResources.getSharedUsage()); } diff --git a/gluten-data/src/main/java/org/apache/gluten/memory/nmm/ManagedReservationListener.java b/gluten-data/src/main/java/org/apache/gluten/memory/listener/ManagedReservationListener.java similarity index 98% rename from gluten-data/src/main/java/org/apache/gluten/memory/nmm/ManagedReservationListener.java rename to gluten-data/src/main/java/org/apache/gluten/memory/listener/ManagedReservationListener.java index 88e7933209209..b7d6ecd67589a 100644 --- a/gluten-data/src/main/java/org/apache/gluten/memory/nmm/ManagedReservationListener.java +++ b/gluten-data/src/main/java/org/apache/gluten/memory/listener/ManagedReservationListener.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.memory.nmm; +package org.apache.gluten.memory.listener; import org.apache.gluten.memory.SimpleMemoryUsageRecorder; import org.apache.gluten.memory.memtarget.MemoryTarget; diff --git a/gluten-data/src/main/java/org/apache/gluten/memory/nmm/ReservationListener.java b/gluten-data/src/main/java/org/apache/gluten/memory/listener/ReservationListener.java similarity index 76% rename from gluten-data/src/main/java/org/apache/gluten/memory/nmm/ReservationListener.java rename to gluten-data/src/main/java/org/apache/gluten/memory/listener/ReservationListener.java index f423bc8fb4e43..6f8cb867e0033 100644 --- a/gluten-data/src/main/java/org/apache/gluten/memory/nmm/ReservationListener.java +++ b/gluten-data/src/main/java/org/apache/gluten/memory/listener/ReservationListener.java @@ -14,14 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
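The Arrow allocator above now builds its memory target as throw-on-OOM over optional dynamic off-heap sizing over a plain task memory consumer, passing `Spillers.NOOP` in place of the previous empty spiller list. A hedged sketch of the same composition for an arbitrary consumer; the consumer name is illustrative and the stats map value type (`MemoryUsageStatsBuilder`) is assumed from the imports used elsewhere in this patch:

```scala
import java.util.Collections

import org.apache.gluten.memory.MemoryUsageStatsBuilder
import org.apache.gluten.memory.memtarget.{MemoryTargets, Spillers}

import org.apache.spark.util.TaskResources

// Must run inside a Spark task so a TaskMemoryManager is available.
val tmm = TaskResources.getLocalTaskContext().taskMemoryManager()
val target = MemoryTargets.throwOnOom(
  MemoryTargets.dynamicOffHeapSizingIfEnabled(
    MemoryTargets.newConsumer(
      tmm,
      "ExampleConsumer", // illustrative name
      Spillers.NOOP,
      Collections.emptyMap[String, MemoryUsageStatsBuilder]())))
```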
*/ -package org.apache.gluten.memory.nmm; - -import org.apache.gluten.memory.SimpleMemoryUsageRecorder; -import org.apache.gluten.memory.memtarget.NoopMemoryTarget; +package org.apache.gluten.memory.listener; public interface ReservationListener { - ReservationListener NOOP = - new ManagedReservationListener(new NoopMemoryTarget(), new SimpleMemoryUsageRecorder()); long reserve(long size); diff --git a/gluten-data/src/main/java/org/apache/gluten/memory/listener/ReservationListeners.java b/gluten-data/src/main/java/org/apache/gluten/memory/listener/ReservationListeners.java new file mode 100644 index 0000000000000..47b9937eb7a3a --- /dev/null +++ b/gluten-data/src/main/java/org/apache/gluten/memory/listener/ReservationListeners.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.memory.listener; + +import org.apache.gluten.GlutenConfig; +import org.apache.gluten.memory.MemoryUsageStatsBuilder; +import org.apache.gluten.memory.SimpleMemoryUsageRecorder; +import org.apache.gluten.memory.memtarget.*; + +import org.apache.spark.memory.TaskMemoryManager; +import org.apache.spark.util.TaskResources; + +import java.util.Collections; +import java.util.Map; + +public final class ReservationListeners { + public static final ReservationListener NOOP = + new ManagedReservationListener(new NoopMemoryTarget(), new SimpleMemoryUsageRecorder()); + + public static ReservationListener create( + String name, Spiller spiller, Map mutableStats) { + if (!TaskResources.inSparkTask()) { + throw new IllegalStateException( + "Spillable reservation listener must be used in a Spark task."); + } + return create0(name, spiller, mutableStats); + } + + private static ReservationListener create0( + String name, Spiller spiller, Map mutableStats) { + // Memory target. + final double overAcquiredRatio = GlutenConfig.getConf().memoryOverAcquiredRatio(); + final long reservationBlockSize = GlutenConfig.getConf().memoryReservationBlockSize(); + final TaskMemoryManager tmm = TaskResources.getLocalTaskContext().taskMemoryManager(); + final MemoryTarget target = + MemoryTargets.throwOnOom( + MemoryTargets.overAcquire( + MemoryTargets.dynamicOffHeapSizingIfEnabled( + MemoryTargets.newConsumer( + tmm, + name, + Spillers.withMinSpillSize(spiller, reservationBlockSize), + mutableStats)), + MemoryTargets.dynamicOffHeapSizingIfEnabled( + MemoryTargets.newConsumer( + tmm, + "OverAcquire.DummyTarget", + new Spiller() { + @Override + public long spill(MemoryTarget self, Spiller.Phase phase, long size) { + if (!Spillers.PHASE_SET_ALL.contains(phase)) { + return 0L; + } + return self.repay(size); + } + }, + Collections.emptyMap())), + overAcquiredRatio)); + + // Listener. 
+ return new ManagedReservationListener(target, TaskResources.getSharedUsage()); + } +} diff --git a/gluten-data/src/main/java/org/apache/gluten/memory/nmm/NativeMemoryManager.java b/gluten-data/src/main/java/org/apache/gluten/memory/nmm/NativeMemoryManager.java deleted file mode 100644 index 0d1a0c5aec418..0000000000000 --- a/gluten-data/src/main/java/org/apache/gluten/memory/nmm/NativeMemoryManager.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.gluten.memory.nmm; - -import org.apache.gluten.backendsapi.BackendsApiManager; -import org.apache.gluten.memory.alloc.NativeMemoryAllocators; -import org.apache.gluten.memory.memtarget.KnownNameAndStats; -import org.apache.gluten.proto.MemoryUsageStats; - -import com.google.protobuf.InvalidProtocolBufferException; -import org.apache.spark.memory.SparkMemoryUtil; -import org.apache.spark.util.TaskResource; -import org.apache.spark.util.Utils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class NativeMemoryManager implements TaskResource { - - private static final Logger LOGGER = LoggerFactory.getLogger(NativeMemoryManager.class); - - private final long nativeInstanceHandle; - private final String name; - private final ReservationListener listener; - - private NativeMemoryManager( - String name, long nativeInstanceHandle, ReservationListener listener) { - this.name = name; - this.nativeInstanceHandle = nativeInstanceHandle; - this.listener = listener; - } - - public static NativeMemoryManager create(String name, ReservationListener listener) { - long allocatorId = NativeMemoryAllocators.getDefault().get().getNativeInstanceId(); - return new NativeMemoryManager( - name, create(BackendsApiManager.getBackendName(), name, allocatorId, listener), listener); - } - - public long getNativeInstanceHandle() { - return this.nativeInstanceHandle; - } - - public MemoryUsageStats collectMemoryUsage() { - try { - return MemoryUsageStats.parseFrom(collectMemoryUsage(nativeInstanceHandle)); - } catch (InvalidProtocolBufferException e) { - throw new RuntimeException(e); - } - } - - public long shrink(long size) { - return shrink(nativeInstanceHandle, size); - } - - // Hold this memory manager. The underlying memory pools will be released as lately as this - // memory manager gets destroyed. Which means, a call to this function would make sure the - // memory blocks directly or indirectly managed by this manager, be guaranteed safe to - // access during the period that this manager is alive. 
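
The new ReservationListeners factory above takes over what callers previously did through NativeMemoryManager/NativeMemoryManagers (deleted in the surrounding hunks): a component requests a listener by name and supplies one phase-aware Spiller plus a mutable stats map. A hedged usage sketch follows; the component name and spiller body are illustrative only, and the call must happen inside a Spark task or create() throws.

import java.util.Collections

import org.apache.gluten.memory.MemoryUsageStatsBuilder
import org.apache.gluten.memory.listener.ReservationListeners
import org.apache.gluten.memory.memtarget.{MemoryTarget, Spiller, Spillers}

object ReservationListenerSketch {
  def createFor(component: String) =
    ReservationListeners.create(
      component, // hypothetical component name
      new Spiller {
        override def spill(self: MemoryTarget, phase: Spiller.Phase, size: Long): Long = {
          if (!Spillers.PHASE_SET_SPILL_ONLY.contains(phase)) {
            return 0L // ignore shrink-only and other phases this component cannot serve
          }
          0L // a real component would evict up to `size` bytes and return the freed amount
        }
      },
      Collections.emptyMap[String, MemoryUsageStatsBuilder]()
    )
}

Note the contract change folded into this factory: the old Spiller pair of spill(self, size) plus applicablePhases() becomes a single spill(self, phase, size), with implementations returning 0 for phases they do not handle.
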
- public void hold() { - hold(nativeInstanceHandle); - } - - private static native long shrink(long memoryManagerId, long size); - - private static native long create( - String backendType, String name, long allocatorId, ReservationListener listener); - - private static native void release(long memoryManagerId); - - private static native byte[] collectMemoryUsage(long memoryManagerId); - - private static native void hold(long memoryManagerId); - - @Override - public void release() throws Exception { - if (LOGGER.isDebugEnabled()) { - LOGGER.debug( - SparkMemoryUtil.prettyPrintStats( - "About to release memory manager, usage dump:", - new KnownNameAndStats() { - @Override - public String name() { - return name; - } - - @Override - public MemoryUsageStats stats() { - return collectMemoryUsage(); - } - })); - } - release(nativeInstanceHandle); - if (listener.getUsedBytes() != 0) { - LOGGER.warn( - String.format( - "%s Reservation listener %s still reserved non-zero bytes, " - + "which may cause memory leak, size: %s. ", - name, listener.toString(), Utils.bytesToString(listener.getUsedBytes()))); - } - } - - @Override - public int priority() { - return 0; // lowest release priority - } - - @Override - public String resourceName() { - return name + "_mem"; - } -} diff --git a/gluten-data/src/main/java/org/apache/gluten/memory/nmm/NativeMemoryManagers.java b/gluten-data/src/main/java/org/apache/gluten/memory/nmm/NativeMemoryManagers.java deleted file mode 100644 index 928f869ba4e12..0000000000000 --- a/gluten-data/src/main/java/org/apache/gluten/memory/nmm/NativeMemoryManagers.java +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.gluten.memory.nmm; - -import org.apache.gluten.GlutenConfig; -import org.apache.gluten.memory.MemoryUsageRecorder; -import org.apache.gluten.memory.memtarget.MemoryTarget; -import org.apache.gluten.memory.memtarget.MemoryTargets; -import org.apache.gluten.memory.memtarget.Spiller; -import org.apache.gluten.memory.memtarget.Spillers; -import org.apache.gluten.proto.MemoryUsageStats; - -import org.apache.spark.memory.TaskMemoryManager; -import org.apache.spark.util.TaskResources; - -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Optional; -import java.util.Set; -import java.util.concurrent.atomic.AtomicReference; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -public final class NativeMemoryManagers { - - // TODO: Let all caller support spill. 
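
The NativeMemoryManagers class deleted below assembled its spillers and memory targets inline at every call site; in the new design that composition is centralized in ReservationListeners.create0 and the task-scoped Runtime. A rough sketch of the reusable pieces involved, assuming GlutenConfig is initialized; the object name is illustrative.

import org.apache.gluten.GlutenConfig
import org.apache.gluten.memory.memtarget.{Spiller, Spillers}

object SpillerCompositionSketch {
  // An appendable spiller lets more spillers be registered after the
  // reservation listener exists; Runtime#addSpiller builds on this.
  val spillers = Spillers.appendable()

  // withMinSpillSize batches small spill requests up to the configured
  // reservation block size, mirroring ReservationListeners.create0.
  def batched(): Spiller =
    Spillers.withMinSpillSize(spillers, GlutenConfig.getConf.memoryReservationBlockSize)

  // Registered later, e.g. once a native shuffle writer or out-iterator exists.
  def register(extra: Spiller): Unit = spillers.append(extra)
}
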
- public static NativeMemoryManager contextInstance(String name) { - if (!TaskResources.inSparkTask()) { - throw new IllegalStateException("This method must be called in a Spark task."); - } - String id = "NativeMemoryManager:" + name; - return TaskResources.addResourceIfNotRegistered( - id, () -> createNativeMemoryManager(name, Collections.emptyList())); - } - - public static NativeMemoryManager create(String name, Spiller... spillers) { - if (!TaskResources.inSparkTask()) { - throw new IllegalStateException("Spiller must be used in a Spark task."); - } - - final NativeMemoryManager manager = createNativeMemoryManager(name, Arrays.asList(spillers)); - return TaskResources.addAnonymousResource(manager); - } - - private static NativeMemoryManager createNativeMemoryManager( - String name, List spillers) { - final AtomicReference out = new AtomicReference<>(); - // memory target - final double overAcquiredRatio = GlutenConfig.getConf().memoryOverAcquiredRatio(); - final long reservationBlockSize = GlutenConfig.getConf().memoryReservationBlockSize(); - final TaskMemoryManager tmm = TaskResources.getLocalTaskContext().taskMemoryManager(); - final MemoryTarget target = - MemoryTargets.throwOnOom( - MemoryTargets.overAcquire( - MemoryTargets.newConsumer( - tmm, - name, - // call memory manager's shrink API, if no good then call the spiller - Stream.concat( - Stream.of( - new Spiller() { - @Override - public long spill(MemoryTarget self, long size) { - return Optional.of(out.get()) - .map(nmm -> nmm.shrink(size)) - .orElseThrow( - () -> - new IllegalStateException( - "" - + "Shrink is requested before native " - + "memory manager is created. Try moving " - + "any actions about memory allocation out " - + "from the memory manager constructor.")); - } - - @Override - public Set applicablePhases() { - return Spillers.PHASE_SET_SHRINK_ONLY; - } - }), - spillers.stream()) - .map(spiller -> Spillers.withMinSpillSize(spiller, reservationBlockSize)) - .collect(Collectors.toList()), - Collections.singletonMap( - "single", - new MemoryUsageRecorder() { - @Override - public void inc(long bytes) { - // no-op - } - - @Override - public long peak() { - throw new UnsupportedOperationException("Not implemented"); - } - - @Override - public long current() { - throw new UnsupportedOperationException("Not implemented"); - } - - @Override - public MemoryUsageStats toStats() { - return getNativeMemoryManager().collectMemoryUsage(); - } - - private NativeMemoryManager getNativeMemoryManager() { - return Optional.of(out.get()) - .orElseThrow( - () -> - new IllegalStateException( - "" - + "Memory usage stats are requested before native " - + "memory manager is created. 
Try moving any " - + "actions about memory allocation out from the " - + "memory manager constructor.")); - } - })), - MemoryTargets.newConsumer( - tmm, - "OverAcquire.DummyTarget", - Collections.singletonList( - new Spiller() { - @Override - public long spill(MemoryTarget self, long size) { - return self.repay(size); - } - - @Override - public Set applicablePhases() { - return Spillers.PHASE_SET_ALL; - } - }), - Collections.emptyMap()), - overAcquiredRatio)); - // listener - ManagedReservationListener rl = - new ManagedReservationListener(target, TaskResources.getSharedUsage()); - // native memory manager - out.set(NativeMemoryManager.create(name, rl)); - return out.get(); - } -} diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchInIterator.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchInIterator.java index bd89f62a18068..c147862d01395 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchInIterator.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchInIterator.java @@ -18,6 +18,7 @@ import org.apache.gluten.columnarbatch.ColumnarBatchJniWrapper; import org.apache.gluten.columnarbatch.ColumnarBatches; +import org.apache.gluten.exec.Runtimes; import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators; import org.apache.spark.sql.vectorized.ColumnarBatch; @@ -25,6 +26,7 @@ import java.util.Iterator; public class ColumnarBatchInIterator extends GeneralInIterator { + public ColumnarBatchInIterator(Iterator delegated) { super(delegated); } @@ -33,7 +35,8 @@ public long next() { final ColumnarBatch next = nextColumnarBatch(); if (next.numCols() == 0) { // the operation will find a zero column batch from a task-local pool - return ColumnarBatchJniWrapper.create().getForEmptySchema(next.numRows()); + return ColumnarBatchJniWrapper.create(Runtimes.contextInstance("ColumnarBatchInIterator")) + .getForEmptySchema(next.numRows()); } final ColumnarBatch offloaded = ColumnarBatches.ensureOffloaded(ArrowBufferAllocators.contextInstance(), next); diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchOutIterator.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchOutIterator.java index 82b398439cde9..9dd0404384ad4 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchOutIterator.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchOutIterator.java @@ -19,7 +19,6 @@ import org.apache.gluten.columnarbatch.ColumnarBatches; import org.apache.gluten.exec.Runtime; import org.apache.gluten.exec.RuntimeAware; -import org.apache.gluten.memory.nmm.NativeMemoryManager; import org.apache.gluten.metrics.IMetrics; import org.apache.spark.sql.vectorized.ColumnarBatch; @@ -29,14 +28,11 @@ public class ColumnarBatchOutIterator extends GeneralOutIterator implements RuntimeAware { private final Runtime runtime; private final long iterHandle; - private final NativeMemoryManager nmm; - public ColumnarBatchOutIterator(Runtime runtime, long iterHandle, NativeMemoryManager nmm) - throws IOException { + public ColumnarBatchOutIterator(Runtime runtime, long iterHandle) { super(); this.runtime = runtime; this.iterHandle = iterHandle; - this.nmm = nmm; } @Override @@ -71,7 +67,7 @@ public ColumnarBatch nextInternal() throws IOException { if (batchHandle == -1L) { return null; // stream ended } - return ColumnarBatches.create(runtime, batchHandle); + return ColumnarBatches.create(batchHandle); } @Override @@ -80,13 
+76,18 @@ public IMetrics getMetricsInternal() throws IOException, ClassNotFoundException } public long spill(long size) { - return nativeSpill(iterHandle, size); + if (!closed.get()) { + return nativeSpill(iterHandle, size); + } else { + return 0L; + } } @Override public void closeInternal() { - nmm.hold(); // to make sure the outputted batches are still accessible after the iterator is - // closed + // To make sure the outputted batches are still accessible after the iterator is closed. + // TODO: Remove this API if we have other choice, e.g., hold the pools in native code. + runtime.holdMemory(); nativeClose(iterHandle); } } diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchSerializerJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchSerializerJniWrapper.java index 59f31689e1076..bfe0d756112f4 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchSerializerJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchSerializerJniWrapper.java @@ -18,7 +18,6 @@ import org.apache.gluten.exec.Runtime; import org.apache.gluten.exec.RuntimeAware; -import org.apache.gluten.exec.Runtimes; public class ColumnarBatchSerializerJniWrapper implements RuntimeAware { private final Runtime runtime; @@ -27,11 +26,7 @@ private ColumnarBatchSerializerJniWrapper(Runtime runtime) { this.runtime = runtime; } - public static ColumnarBatchSerializerJniWrapper create() { - return new ColumnarBatchSerializerJniWrapper(Runtimes.contextInstance()); - } - - public static ColumnarBatchSerializerJniWrapper forRuntime(Runtime runtime) { + public static ColumnarBatchSerializerJniWrapper create(Runtime runtime) { return new ColumnarBatchSerializerJniWrapper(runtime); } @@ -40,10 +35,10 @@ public long handle() { return runtime.getHandle(); } - public native ColumnarBatchSerializeResult serialize(long[] handles, long memoryManagerHandle); + public native ColumnarBatchSerializeResult serialize(long[] handles); // Return the native ColumnarBatchSerializer handle - public native long init(long cSchema, long memoryManagerHandle); + public native long init(long cSchema); public native long deserialize(long serializerHandle, byte[] data); diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeColumnarToRowJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeColumnarToRowJniWrapper.java index 7ae2fafb31b4f..7f8de78f95ef9 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeColumnarToRowJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeColumnarToRowJniWrapper.java @@ -18,7 +18,6 @@ import org.apache.gluten.exec.Runtime; import org.apache.gluten.exec.RuntimeAware; -import org.apache.gluten.exec.Runtimes; public class NativeColumnarToRowJniWrapper implements RuntimeAware { private final Runtime runtime; @@ -27,8 +26,8 @@ private NativeColumnarToRowJniWrapper(Runtime runtime) { this.runtime = runtime; } - public static NativeColumnarToRowJniWrapper create() { - return new NativeColumnarToRowJniWrapper(Runtimes.contextInstance()); + public static NativeColumnarToRowJniWrapper create(Runtime runtime) { + return new NativeColumnarToRowJniWrapper(runtime); } @Override @@ -36,9 +35,9 @@ public long handle() { return runtime.getHandle(); } - public native long nativeColumnarToRowInit(long memoryManagerHandle) throws RuntimeException; + public native long nativeColumnarToRowInit() throws RuntimeException; - public native 
NativeColumnarToRowInfo nativeColumnarToRowConvert(long batchHandle, long c2rHandle) + public native NativeColumnarToRowInfo nativeColumnarToRowConvert(long c2rHandle, long batchHandle) throws RuntimeException; public native void nativeClose(long c2rHandle); diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/NativePlanEvaluator.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/NativePlanEvaluator.java index e54724a599c1b..e5eea029b2b3d 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/NativePlanEvaluator.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/NativePlanEvaluator.java @@ -22,8 +22,6 @@ import org.apache.gluten.memory.memtarget.MemoryTarget; import org.apache.gluten.memory.memtarget.Spiller; import org.apache.gluten.memory.memtarget.Spillers; -import org.apache.gluten.memory.nmm.NativeMemoryManager; -import org.apache.gluten.memory.nmm.NativeMemoryManagers; import org.apache.gluten.utils.DebugUtil; import org.apache.gluten.validate.NativePlanValidationInfo; @@ -33,17 +31,15 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.List; -import java.util.Optional; -import java.util.Set; import java.util.UUID; -import java.util.concurrent.atomic.AtomicReference; public class NativePlanEvaluator { + private final Runtime runtime = Runtimes.contextInstance("WholeStageIterator"); private final PlanEvaluatorJniWrapper jniWrapper; private NativePlanEvaluator() { - jniWrapper = PlanEvaluatorJniWrapper.create(); + jniWrapper = PlanEvaluatorJniWrapper.create(runtime); } public static NativePlanEvaluator create() { @@ -63,31 +59,6 @@ public void injectWriteFilesTempPath(String path) { public GeneralOutIterator createKernelWithBatchIterator( byte[] wsPlan, byte[][] splitInfo, List iterList, int partitionIndex) throws RuntimeException, IOException { - final AtomicReference outIterator = new AtomicReference<>(); - final NativeMemoryManager nmm = - NativeMemoryManagers.create( - "WholeStageIterator", - new Spiller() { - @Override - public long spill(MemoryTarget self, long size) { - ColumnarBatchOutIterator instance = - Optional.of(outIterator.get()) - .orElseThrow( - () -> - new IllegalStateException( - "Fatal: spill() called before a output iterator " - + "is created. 
This behavior should be optimized " - + "by moving memory allocations from create() to " - + "hasNext()/next()")); - return instance.spill(size); - } - - @Override - public Set applicablePhases() { - return Spillers.PHASE_SET_SPILL_ONLY; - } - }); - final long memoryManagerHandle = nmm.getNativeInstanceHandle(); final String spillDirPath = SparkDirectoryUtil.get() @@ -95,9 +66,8 @@ public Set applicablePhases() { .mkChildDirRoundRobin(UUID.randomUUID().toString()) .getAbsolutePath(); - long iterHandle = + final long itrHandle = jniWrapper.nativeCreateKernelWithIterator( - memoryManagerHandle, wsPlan, splitInfo, iterList.toArray(new GeneralInIterator[0]), @@ -106,12 +76,21 @@ public Set applicablePhases() { TaskContext.get().taskAttemptId(), DebugUtil.saveInputToFile(), BackendsApiManager.getSparkPlanExecApiInstance().rewriteSpillPath(spillDirPath)); - outIterator.set(createOutIterator(Runtimes.contextInstance(), iterHandle, nmm)); - return outIterator.get(); + final ColumnarBatchOutIterator out = createOutIterator(runtime, itrHandle); + runtime.addSpiller( + new Spiller() { + @Override + public long spill(MemoryTarget self, Spiller.Phase phase, long size) { + if (!Spillers.PHASE_SET_SPILL_ONLY.contains(phase)) { + return 0L; + } + return out.spill(size); + } + }); + return out; } - private ColumnarBatchOutIterator createOutIterator( - Runtime runtime, long iterHandle, NativeMemoryManager nmm) throws IOException { - return new ColumnarBatchOutIterator(runtime, iterHandle, nmm); + private ColumnarBatchOutIterator createOutIterator(Runtime runtime, long itrHandle) { + return new ColumnarBatchOutIterator(runtime, itrHandle); } } diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeRowToColumnarJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeRowToColumnarJniWrapper.java index b43c92b991423..1185d52fee773 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeRowToColumnarJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeRowToColumnarJniWrapper.java @@ -18,7 +18,6 @@ import org.apache.gluten.exec.Runtime; import org.apache.gluten.exec.RuntimeAware; -import org.apache.gluten.exec.Runtimes; public class NativeRowToColumnarJniWrapper implements RuntimeAware { private final Runtime runtime; @@ -27,8 +26,8 @@ private NativeRowToColumnarJniWrapper(Runtime runtime) { this.runtime = runtime; } - public static NativeRowToColumnarJniWrapper create() { - return new NativeRowToColumnarJniWrapper(Runtimes.contextInstance()); + public static NativeRowToColumnarJniWrapper create(Runtime runtime) { + return new NativeRowToColumnarJniWrapper(runtime); } @Override @@ -36,7 +35,7 @@ public long handle() { return runtime.getHandle(); } - public native long init(long cSchema, long memoryManagerHandle); + public native long init(long cSchema); public native long nativeConvertRowToColumnar( long r2cHandle, long[] rowLength, long bufferAddress); diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/PlanEvaluatorJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/PlanEvaluatorJniWrapper.java index 5e8ae392af8d0..eecd7c9e28790 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/PlanEvaluatorJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/PlanEvaluatorJniWrapper.java @@ -18,7 +18,6 @@ import org.apache.gluten.exec.Runtime; import org.apache.gluten.exec.RuntimeAware; -import org.apache.gluten.exec.Runtimes; import 
org.apache.gluten.validate.NativePlanValidationInfo; /** @@ -33,8 +32,8 @@ private PlanEvaluatorJniWrapper(Runtime runtime) { this.runtime = runtime; } - public static PlanEvaluatorJniWrapper create() { - return new PlanEvaluatorJniWrapper(Runtimes.contextInstance()); + public static PlanEvaluatorJniWrapper create(Runtime runtime) { + return new PlanEvaluatorJniWrapper(runtime); } @Override @@ -57,11 +56,9 @@ public long handle() { /** * Create a native compute kernel and return a columnar result iterator. * - * @param memoryManagerHandle NativeMemoryManager instance handle * @return iterator instance id */ public native long nativeCreateKernelWithIterator( - long memoryManagerHandle, byte[] wsPlan, byte[][] splitInfo, GeneralInIterator[] batchItr, diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleReaderJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleReaderJniWrapper.java index 24425ccf72e60..515486e45a5bc 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleReaderJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleReaderJniWrapper.java @@ -18,7 +18,6 @@ import org.apache.gluten.exec.Runtime; import org.apache.gluten.exec.RuntimeAware; -import org.apache.gluten.exec.Runtimes; public class ShuffleReaderJniWrapper implements RuntimeAware { private final Runtime runtime; @@ -27,8 +26,8 @@ private ShuffleReaderJniWrapper(Runtime runtime) { this.runtime = runtime; } - public static ShuffleReaderJniWrapper create() { - return new ShuffleReaderJniWrapper(Runtimes.contextInstance()); + public static ShuffleReaderJniWrapper create(Runtime runtime) { + return new ShuffleReaderJniWrapper(runtime); } @Override @@ -38,7 +37,6 @@ public long handle() { public native long make( long cSchema, - long memoryManagerHandle, String compressionType, String compressionCodecBackend, int batchSize, diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleWriterJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleWriterJniWrapper.java index f4e1172757fe5..883fc600171f2 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleWriterJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleWriterJniWrapper.java @@ -18,7 +18,6 @@ import org.apache.gluten.exec.Runtime; import org.apache.gluten.exec.RuntimeAware; -import org.apache.gluten.exec.Runtimes; import java.io.IOException; @@ -29,8 +28,8 @@ private ShuffleWriterJniWrapper(Runtime runtime) { this.runtime = runtime; } - public static ShuffleWriterJniWrapper create() { - return new ShuffleWriterJniWrapper(Runtimes.contextInstance()); + public static ShuffleWriterJniWrapper create(Runtime runtime) { + return new ShuffleWriterJniWrapper(runtime); } @Override @@ -65,7 +64,6 @@ public long make( String dataFile, int subDirsPerLocalDir, String localDirs, - long memoryManagerHandle, double reallocThreshold, long handle, long taskAttemptId, @@ -85,7 +83,6 @@ public long make( dataFile, subDirsPerLocalDir, localDirs, - memoryManagerHandle, reallocThreshold, handle, taskAttemptId, @@ -115,7 +112,6 @@ public long makeForRSS( int pushBufferMaxSize, long sortBufferMaxSize, Object pusher, - long memoryManagerHandle, long handle, long taskAttemptId, int startPartitionId, @@ -136,7 +132,6 @@ public long makeForRSS( null, 0, null, - memoryManagerHandle, reallocThreshold, handle, taskAttemptId, @@ -162,7 +157,6 @@ public native long nativeMake( String dataFile, int subDirsPerLocalDir, 
String localDirs, - long memoryManagerHandle, double reallocThreshold, long handle, long taskAttemptId, diff --git a/gluten-data/src/main/scala/org/apache/gluten/exec/Runtime.scala b/gluten-data/src/main/scala/org/apache/gluten/exec/Runtime.scala index 12d855c7145ab..1f632659eadf8 100644 --- a/gluten-data/src/main/scala/org/apache/gluten/exec/Runtime.scala +++ b/gluten-data/src/main/scala/org/apache/gluten/exec/Runtime.scala @@ -20,35 +20,111 @@ import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.exception.GlutenException import org.apache.gluten.init.JniUtils +import org.apache.gluten.memory.MemoryUsageStatsBuilder +import org.apache.gluten.memory.listener.ReservationListeners +import org.apache.gluten.memory.memtarget.{KnownNameAndStats, MemoryTarget, Spiller, Spillers} +import org.apache.gluten.proto.MemoryUsageStats -import org.apache.spark.sql.internal.GlutenConfigUtil -import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.memory.SparkMemoryUtil +import org.apache.spark.sql.internal.{GlutenConfigUtil, SQLConf} import org.apache.spark.util.TaskResource +import org.slf4j.LoggerFactory + import java.util.concurrent.atomic.AtomicBoolean -class Runtime private[exec] () extends TaskResource { - private val handle = RuntimeJniWrapper.createRuntime( - BackendsApiManager.getBackendName, - JniUtils.toNativeConf( - GlutenConfig.getNativeSessionConf( - BackendsApiManager.getSettings.getBackendConfigPrefix, - GlutenConfigUtil.parseConfig(SQLConf.get.getAllConfs))) - ) +import scala.collection.JavaConverters._ +import scala.collection.mutable + +trait Runtime { + def addSpiller(spiller: Spiller): Unit + def holdMemory(): Unit + def collectMemoryUsage(): MemoryUsageStats + def getHandle(): Long +} + +object Runtime { + private[exec] def apply(name: String): Runtime with TaskResource = { + new RuntimeImpl(name) + } + + private class RuntimeImpl(name: String) extends Runtime with TaskResource { + private val LOGGER = LoggerFactory.getLogger(classOf[Runtime]) + + private val spillers = Spillers.appendable() + private val mutableStats: mutable.Map[String, MemoryUsageStatsBuilder] = mutable.Map() + private val rl = ReservationListeners.create(resourceName(), spillers, mutableStats.asJava) + private val handle = RuntimeJniWrapper.createRuntime( + BackendsApiManager.getBackendName, + rl, + JniUtils.toNativeConf( + GlutenConfig.getNativeSessionConf( + BackendsApiManager.getSettings.getBackendConfigPrefix, + GlutenConfigUtil.parseConfig(SQLConf.get.getAllConfs))) + ) - private val released: AtomicBoolean = new AtomicBoolean(false) + spillers.append(new Spiller() { + override def spill(self: MemoryTarget, phase: Spiller.Phase, size: Long): Long = { + if (!Spillers.PHASE_SET_SHRINK_ONLY.contains(phase)) { + // Only respond for shrinking. 
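
All of the JNI wrappers touched above (ColumnarBatchSerializer, NativeColumnarToRow, NativeRowToColumnar, PlanEvaluator, ShuffleReader, ShuffleWriter) now take the Runtime explicitly via create(runtime), and their native init methods drop the memoryManagerHandle parameter. A short sketch of the new calling convention, assuming task scope; the component names follow the ones used in the patch, while the surrounding object is illustrative.

import org.apache.gluten.exec.Runtimes
import org.apache.gluten.vectorized.{ShuffleReaderJniWrapper, ShuffleWriterJniWrapper}

object RuntimeBoundWrapperSketch {
  def wrappers() = {
    // One named Runtime per component; native allocations performed through
    // these wrappers are accounted against that runtime's reservation listener.
    val readerRuntime = Runtimes.contextInstance("ShuffleReader")
    val reader = ShuffleReaderJniWrapper.create(readerRuntime)
    val writer = ShuffleWriterJniWrapper.create(Runtimes.contextInstance("ShuffleWriter"))
    (reader, writer)
  }
}
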
+ return 0L + } + RuntimeJniWrapper.shrinkMemory(handle, size) + } + }) + mutableStats += "single" -> new MemoryUsageStatsBuilder { + override def toStats: MemoryUsageStats = collectMemoryUsage() + } - def getHandle: Long = handle + private val released: AtomicBoolean = new AtomicBoolean(false) - override def release(): Unit = { - if (!released.compareAndSet(false, true)) { - throw new GlutenException( - s"Runtime instance already released: $getHandle, ${resourceName()}, ${priority()}") + def getHandle: Long = handle + + def addSpiller(spiller: Spiller): Unit = { + spillers.append(spiller) } - RuntimeJniWrapper.releaseRuntime(handle) - } - override def priority(): Int = 10 + def holdMemory(): Unit = { + RuntimeJniWrapper.holdMemory(handle) + } + + def collectMemoryUsage(): MemoryUsageStats = { + MemoryUsageStats.parseFrom(RuntimeJniWrapper.collectMemoryUsage(handle)) + } + + override def release(): Unit = { + if (!released.compareAndSet(false, true)) { + throw new GlutenException( + s"Runtime instance already released: $handle, ${resourceName()}, ${priority()}") + } + if (LOGGER.isDebugEnabled) { + LOGGER.debug( + SparkMemoryUtil.prettyPrintStats( + "About to release memory manager, usage dump:", + new KnownNameAndStats() { + override def name: String = resourceName() + + override def stats: MemoryUsageStats = collectMemoryUsage() + } + )) + } - override def resourceName(): String = s"Runtime_" + handle + RuntimeJniWrapper.releaseRuntime(handle) + + if (rl.getUsedBytes != 0) { + LOGGER.warn( + String.format( + "%s Reservation listener %s still reserved non-zero bytes, which may cause memory" + + " leak, size: %s. ", + name, + rl.toString, + SparkMemoryUtil.bytesToString(rl.getUsedBytes) + )) + } + } + + override def priority(): Int = 0 + + override def resourceName(): String = name + } } diff --git a/gluten-data/src/main/scala/org/apache/gluten/exec/Runtimes.scala b/gluten-data/src/main/scala/org/apache/gluten/exec/Runtimes.scala index e5204771ce526..3614fe05f6b60 100644 --- a/gluten-data/src/main/scala/org/apache/gluten/exec/Runtimes.scala +++ b/gluten-data/src/main/scala/org/apache/gluten/exec/Runtimes.scala @@ -16,21 +16,20 @@ */ package org.apache.gluten.exec -import org.apache.spark.util.TaskResources +import org.apache.spark.util.{TaskResource, TaskResources} object Runtimes { - private val RUNTIME_NAME = "Runtime" /** Get or create the runtime which bound with Spark TaskContext. 
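
Runtimes.contextInstance is now keyed by a caller-supplied name instead of a single shared key, and the returned Runtime carries the duties of the removed NativeMemoryManager: addSpiller, holdMemory and collectMemoryUsage. The sketch below mirrors the NativePlanEvaluator/ColumnarBatchOutIterator flow; acquisition of iterHandle is elided and the object name is illustrative.

import org.apache.gluten.exec.Runtimes
import org.apache.gluten.memory.memtarget.{MemoryTarget, Spiller, Spillers}
import org.apache.gluten.vectorized.ColumnarBatchOutIterator

object RuntimeUsageSketch {
  def wire(iterHandle: Long): ColumnarBatchOutIterator = {
    val runtime = Runtimes.contextInstance("WholeStageIterator")
    val out = new ColumnarBatchOutIterator(runtime, iterHandle)
    // Spill requests arriving at the runtime's memory target are forwarded to
    // the iterator, but only for the explicit spill phases.
    runtime.addSpiller(new Spiller {
      override def spill(self: MemoryTarget, phase: Spiller.Phase, size: Long): Long = {
        if (!Spillers.PHASE_SET_SPILL_ONLY.contains(phase)) {
          return 0L
        }
        out.spill(size)
      }
    })
    // On close, ColumnarBatchOutIterator calls runtime.holdMemory() so batches
    // emitted earlier stay valid until the task-level runtime is released.
    out
  }
}
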
*/ - def contextInstance(): Runtime = { + def contextInstance(name: String): Runtime = { if (!TaskResources.inSparkTask()) { throw new IllegalStateException("This method must be called in a Spark task.") } - TaskResources.addResourceIfNotRegistered(RUNTIME_NAME, () => create()) + TaskResources.addResourceIfNotRegistered(name, () => create(name)) } - private def create(): Runtime = { - new Runtime + private def create(name: String): Runtime with TaskResource = { + Runtime(name) } } diff --git a/gluten-data/src/main/scala/org/apache/gluten/metrics/MetricsUtil.scala b/gluten-data/src/main/scala/org/apache/gluten/metrics/MetricsUtil.scala index f11800b89c31f..0c387b4292127 100644 --- a/gluten-data/src/main/scala/org/apache/gluten/metrics/MetricsUtil.scala +++ b/gluten-data/src/main/scala/org/apache/gluten/metrics/MetricsUtil.scala @@ -54,10 +54,13 @@ object MetricsUtil extends Logging { MetricsUpdaterTree( smj.metricsUpdater(), Seq(treeifyMetricsUpdaters(smj.bufferedPlan), treeifyMetricsUpdaters(smj.streamedPlan))) + case t: TransformSupport if t.metricsUpdater() == MetricsUpdater.None => + assert(t.children.size == 1, "MetricsUpdater.None can only be used on unary operator") + treeifyMetricsUpdaters(t.children.head) case t: TransformSupport => MetricsUpdaterTree(t.metricsUpdater(), t.children.map(treeifyMetricsUpdaters)) case _ => - MetricsUpdaterTree(NoopMetricsUpdater, Seq()) + MetricsUpdaterTree(MetricsUpdater.Terminate, Seq()) } } @@ -180,6 +183,8 @@ object MetricsUtil extends Logging { ) } + // FIXME: Metrics updating code is too magical to maintain. Tree-walking algorithm should be made + // more declarative than by counting down these counters that don't have fixed definition. /** * @return * operator index and metrics index @@ -192,6 +197,9 @@ object MetricsUtil extends Logging { metricsIdx: Int, joinParamsMap: JMap[JLong, JoinParams], aggParamsMap: JMap[JLong, AggregationParams]): (JLong, Int) = { + if (mutNode.updater == MetricsUpdater.Terminate) { + return (operatorIdx, metricsIdx) + } val operatorMetrics = new JArrayList[OperatorMetrics]() var curMetricsIdx = metricsIdx relMap @@ -245,18 +253,16 @@ object MetricsUtil extends Logging { mutNode.children.foreach { child => - if (child.updater != NoopMetricsUpdater) { - val result = updateTransformerMetricsInternal( - child, - relMap, - newOperatorIdx, - metrics, - newMetricsIdx, - joinParamsMap, - aggParamsMap) - newOperatorIdx = result._1 - newMetricsIdx = result._2 - } + val result = updateTransformerMetricsInternal( + child, + relMap, + newOperatorIdx, + metrics, + newMetricsIdx, + joinParamsMap, + aggParamsMap) + newOperatorIdx = result._1 + newMetricsIdx = result._2 } (newOperatorIdx, newMetricsIdx) @@ -292,8 +298,6 @@ object MetricsUtil extends Logging { val numNativeMetrics = metrics.inputRows.length if (numNativeMetrics == 0) { () - } else if (mutNode.updater == NoopMetricsUpdater) { - () } else { updateTransformerMetricsInternal( mutNode, @@ -305,7 +309,7 @@ object MetricsUtil extends Logging { aggParamsMap) } } catch { - case e: Throwable => + case e: Exception => logWarning(s"Updating native metrics failed due to ${e.getCause}.") () } diff --git a/gluten-data/src/main/java/org/apache/gluten/memory/alloc/NativeMemoryAllocator.java b/gluten-data/src/main/scala/org/apache/gluten/metrics/SampleMetricsUpdater.scala similarity index 50% rename from gluten-data/src/main/java/org/apache/gluten/memory/alloc/NativeMemoryAllocator.java rename to gluten-data/src/main/scala/org/apache/gluten/metrics/SampleMetricsUpdater.scala index 
6860d68e3c1b2..a108a5b7979d5 100644 --- a/gluten-data/src/main/java/org/apache/gluten/memory/alloc/NativeMemoryAllocator.java +++ b/gluten-data/src/main/scala/org/apache/gluten/metrics/SampleMetricsUpdater.scala @@ -14,36 +14,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.memory.alloc; - -/** - * This along with {@link NativeMemoryAllocators}, as built-in toolkit for managing native memory - * allocations. - */ -public class NativeMemoryAllocator { - enum Type { - DEFAULT, - } - - private final long nativeInstanceId; - - public NativeMemoryAllocator(long nativeInstanceId) { - this.nativeInstanceId = nativeInstanceId; - } - - public static NativeMemoryAllocator create(Type type) { - return new NativeMemoryAllocator(getAllocator(type.name())); +package org.apache.gluten.metrics + +import org.apache.spark.sql.execution.metric.SQLMetric + +class SampleMetricsUpdater(val metrics: Map[String, SQLMetric]) extends MetricsUpdater { + + override def updateNativeMetrics(opMetrics: IOperatorMetrics): Unit = { + if (opMetrics != null) { + val operatorMetrics = opMetrics.asInstanceOf[OperatorMetrics] + metrics("numOutputRows") += operatorMetrics.outputRows + metrics("outputVectors") += operatorMetrics.outputVectors + metrics("outputBytes") += operatorMetrics.outputBytes + metrics("cpuCount") += operatorMetrics.cpuCount + metrics("wallNanos") += operatorMetrics.wallNanos + metrics("peakMemoryBytes") += operatorMetrics.peakMemoryBytes + metrics("numMemoryAllocations") += operatorMetrics.numMemoryAllocations + } } - - public long getNativeInstanceId() { - return this.nativeInstanceId; - } - - public void close() { - releaseAllocator(this.nativeInstanceId); - } - - private static native long getAllocator(String typeName); - - private static native void releaseAllocator(long allocatorId); } diff --git a/gluten-data/src/main/scala/org/apache/gluten/utils/ArrowUtil.scala b/gluten-data/src/main/scala/org/apache/gluten/utils/ArrowUtil.scala index 99eb72c70ea3f..a94f8f2e3d49c 100644 --- a/gluten-data/src/main/scala/org/apache/gluten/utils/ArrowUtil.scala +++ b/gluten-data/src/main/scala/org/apache/gluten/utils/ArrowUtil.scala @@ -16,16 +16,12 @@ */ package org.apache.gluten.utils -import org.apache.gluten.exception.SchemaMismatchException import org.apache.gluten.vectorized.ArrowWritableColumnVector import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat -import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.execution.vectorized.ArrowColumnVectorUtils -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.sql.utils.{SparkArrowUtil, SparkSchemaUtil} import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} @@ -33,13 +29,15 @@ import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} import org.apache.arrow.c.{ArrowSchema, CDataDictionaryProvider, Data} import org.apache.arrow.dataset.file.{FileFormat, FileSystemDatasetFactory} import org.apache.arrow.dataset.jni.NativeMemoryPool +import org.apache.arrow.dataset.scanner.FragmentScanOptions import org.apache.arrow.memory.BufferAllocator import org.apache.arrow.vector.ipc.message.ArrowRecordBatch import org.apache.arrow.vector.types.pojo.{ArrowType, Field, Schema} import 
org.apache.hadoop.fs.FileStatus -import java.net.URI +import java.net.{URI, URLDecoder} import java.util +import java.util.Optional import scala.collection.JavaConverters._ import scala.collection.mutable @@ -99,26 +97,6 @@ object ArrowUtil extends Logging { new Schema(fields) } - def getFormat(format: String): FileFormat = { - format match { - case "parquet" => FileFormat.PARQUET - case "orc" => FileFormat.ORC - case "csv" => FileFormat.CSV - case _ => throw new IllegalArgumentException("Unrecognizable format") - } - } - - def getFormat(format: org.apache.spark.sql.execution.datasources.FileFormat): FileFormat = { - format match { - case _: ParquetFileFormat => - FileFormat.PARQUET - case _: CSVFileFormat => - FileFormat.CSV - case _ => - throw new IllegalArgumentException("Unrecognizable format") - } - } - private def rewriteUri(encodeUri: String): String = { val decodedUri = encodeUri val uri = URI.create(decodedUri) @@ -142,19 +120,49 @@ object ArrowUtil extends Logging { def makeArrowDiscovery( encodedUri: String, format: FileFormat, + option: Optional[FragmentScanOptions], allocator: BufferAllocator, - pool: NativeMemoryPool): FileSystemDatasetFactory = { - val factory = new FileSystemDatasetFactory(allocator, pool, format, rewriteUri(encodedUri)) + pool: NativeMemoryPool + ): FileSystemDatasetFactory = { + val factory = + new FileSystemDatasetFactory(allocator, pool, format, rewriteUri(encodedUri), option) factory } + def readArrowSchema( + file: String, + format: FileFormat, + option: FragmentScanOptions, + allocator: BufferAllocator, + pool: NativeMemoryPool): Schema = { + val factory: FileSystemDatasetFactory = + makeArrowDiscovery(file, format, Optional.of(option), allocator, pool) + val schema = factory.inspect() + factory.close() + schema + } + + def readArrowFileColumnNames( + file: String, + format: FileFormat, + option: FragmentScanOptions, + allocator: BufferAllocator, + pool: NativeMemoryPool): Array[String] = { + val fileFields = ArrowUtil + .readArrowSchema(URLDecoder.decode(file, "UTF-8"), format, option, allocator, pool) + .getFields + .asScala + fileFields.map(_.getName).toArray + } + def readSchema( file: FileStatus, format: FileFormat, + option: FragmentScanOptions, allocator: BufferAllocator, pool: NativeMemoryPool): Option[StructType] = { val factory: FileSystemDatasetFactory = - makeArrowDiscovery(file.getPath.toString, format, allocator, pool) + makeArrowDiscovery(file.getPath.toString, format, Optional.of(option), allocator, pool) val schema = factory.inspect() try { Option(SparkSchemaUtil.fromArrowSchema(schema)) @@ -166,67 +174,14 @@ object ArrowUtil extends Logging { def readSchema( files: Seq[FileStatus], format: FileFormat, + option: FragmentScanOptions, allocator: BufferAllocator, pool: NativeMemoryPool): Option[StructType] = { if (files.isEmpty) { throw new IllegalArgumentException("No input file specified") } - readSchema(files.head, format, allocator, pool) - } - - def compareStringFunc(caseSensitive: Boolean): (String, String) => Boolean = { - if (caseSensitive) { (str1: String, str2: String) => str1.equals(str2) } - else { (str1: String, str2: String) => str1.equalsIgnoreCase(str2) } - } - - // If user specify schema by .schema(newSchemaDifferentWithFile) - def checkSchema( - requiredField: DataType, - parquetFileFieldType: ArrowType, - parquetFileFields: mutable.Buffer[Field]): Unit = { - val requiredFieldType = - SparkArrowUtil.toArrowType(requiredField, SparkSchemaUtil.getLocalTimezoneID) - if (!requiredFieldType.equals(parquetFileFieldType)) { - 
val arrowFileSchema = parquetFileFields - .map(f => f.toString) - .reduceLeft((f1, f2) => f1 + "\n" + f2) - throw new SchemaMismatchException( - s"Not support specified schema is different with file schema\n$arrowFileSchema") - } - } - - def getRequestedField( - requiredSchema: StructType, - parquetFileFields: mutable.Buffer[Field], - caseSensitive: Boolean): Schema = { - val compareFunc = compareStringFunc(caseSensitive) - requiredSchema.foreach { - readField => - // TODO: check schema inside of complex type - val matchedFields = - parquetFileFields.filter(field => compareFunc(field.getName, readField.name)) - if (!caseSensitive && matchedFields.size > 1) { - // Need to fail if there is ambiguity, i.e. more than one field is matched - val fieldsString = matchedFields.map(_.getName).mkString("[", ", ", "]") - throw new RuntimeException( - s""" - |Found duplicate field(s) "${readField.name}": $fieldsString - - |in case-insensitive mode""".stripMargin.replaceAll("\n", " ")) - } - if (matchedFields.nonEmpty) { - checkSchema( - readField.dataType, - matchedFields.head.getFieldType.getType, - parquetFileFields) - } - } - - val requestColNames = requiredSchema.map(_.name) - new Schema(parquetFileFields.filter { - field => requestColNames.exists(col => compareFunc(col, field.getName)) - }.asJava) + readSchema(files.head, format, option, allocator, pool) } def loadMissingColumns( @@ -262,19 +217,14 @@ object ArrowUtil extends Logging { def loadBatch( allocator: BufferAllocator, input: ArrowRecordBatch, - dataSchema: StructType, - requiredSchema: StructType, + dataSchema: Schema, partitionVectors: Array[ArrowWritableColumnVector] = Array.empty, nullVectors: Array[ArrowWritableColumnVector] = Array.empty): ColumnarBatch = { val rowCount: Int = input.getLength val vectors = try { - ArrowWritableColumnVector.loadColumns( - rowCount, - SparkSchemaUtil.toArrowSchema(dataSchema), - input, - allocator) + ArrowWritableColumnVector.loadColumns(rowCount, dataSchema, input, allocator) } finally { input.close() } @@ -282,21 +232,8 @@ object ArrowUtil extends Logging { val totalVectors = if (nullVectors.nonEmpty) { val finalVectors = mutable.ArrayBuffer[ArrowWritableColumnVector]() - val requiredIterator = requiredSchema.iterator - val compareFunc = compareStringFunc(SQLConf.get.caseSensitiveAnalysis) - while (requiredIterator.hasNext) { - val field = requiredIterator.next() - finalVectors.append(vectors - .find(vector => compareFunc(vector.getValueVector.getName, field.name)) - .getOrElse { - // The missing column need to be find in nullVectors - val nullVector = - nullVectors.find(vector => compareFunc(vector.getValueVector.getName, field.name)).get - nullVector.setValueCount(rowCount) - nullVector.retain() - nullVector - }) - } + finalVectors.appendAll(vectors) + finalVectors.appendAll(nullVectors) finalVectors.toArray } else { vectors diff --git a/gluten-data/src/main/scala/org/apache/gluten/vectorized/ColumnarBatchSerializer.scala b/gluten-data/src/main/scala/org/apache/gluten/vectorized/ColumnarBatchSerializer.scala index 326f836a0a583..e75abe41e4e8c 100644 --- a/gluten-data/src/main/scala/org/apache/gluten/vectorized/ColumnarBatchSerializer.scala +++ b/gluten-data/src/main/scala/org/apache/gluten/vectorized/ColumnarBatchSerializer.scala @@ -19,7 +19,6 @@ package org.apache.gluten.vectorized import org.apache.gluten.GlutenConfig import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators -import org.apache.gluten.memory.nmm.NativeMemoryManagers import 
org.apache.gluten.utils.ArrowAbiUtil import org.apache.spark.SparkEnv @@ -79,7 +78,6 @@ private class ColumnarBatchSerializerInstance( extends SerializerInstance with Logging { - private val nmm = NativeMemoryManagers.contextInstance("ShuffleReader") private val shuffleReaderHandle = { val allocator: BufferAllocator = ArrowBufferAllocators .contextInstance(classOf[ColumnarBatchSerializerInstance].getSimpleName) @@ -98,15 +96,14 @@ private class ColumnarBatchSerializerInstance( val compressionCodecBackend = GlutenConfig.getConf.columnarShuffleCodecBackend.orNull val batchSize = GlutenConfig.getConf.maxBatchSize - val jniWrapper = ShuffleReaderJniWrapper.create() + val runtime = Runtimes.contextInstance("ShuffleReader") + val jniWrapper = ShuffleReaderJniWrapper.create(runtime) val shuffleReaderHandle = jniWrapper.make( cSchema.memoryAddress(), - nmm.getNativeInstanceHandle, compressionCodec, compressionCodecBackend, batchSize, - shuffleWriterType - ) + shuffleWriterType) // Close shuffle reader instance as lately as the end of task processing, // since the native reader could hold a reference to memory pool that // was used to create all buffers read from shuffle reader. The pool @@ -137,12 +134,12 @@ private class ColumnarBatchSerializerInstance( extends DeserializationStream with TaskResource { private val byteIn: JniByteInputStream = JniByteInputStreams.create(in) + private val runtime = Runtimes.contextInstance("ShuffleReader") private val wrappedOut: GeneralOutIterator = new ColumnarBatchOutIterator( - Runtimes.contextInstance(), + runtime, ShuffleReaderJniWrapper - .create() - .readStream(shuffleReaderHandle, byteIn), - nmm) + .create(runtime) + .readStream(shuffleReaderHandle, byteIn)) private var cb: ColumnarBatch = _ diff --git a/gluten-data/src/main/scala/org/apache/spark/shuffle/ColumnarShuffleWriter.scala b/gluten-data/src/main/scala/org/apache/spark/shuffle/ColumnarShuffleWriter.scala index af004b54fd934..4e59d0ba69a92 100644 --- a/gluten-data/src/main/scala/org/apache/spark/shuffle/ColumnarShuffleWriter.scala +++ b/gluten-data/src/main/scala/org/apache/spark/shuffle/ColumnarShuffleWriter.scala @@ -18,10 +18,8 @@ package org.apache.spark.shuffle import org.apache.gluten.GlutenConfig import org.apache.gluten.columnarbatch.ColumnarBatches -import org.apache.gluten.memory.memtarget.MemoryTarget -import org.apache.gluten.memory.memtarget.Spiller -import org.apache.gluten.memory.memtarget.Spillers -import org.apache.gluten.memory.nmm.NativeMemoryManagers +import org.apache.gluten.exec.Runtimes +import org.apache.gluten.memory.memtarget.{MemoryTarget, Spiller, Spillers} import org.apache.gluten.vectorized._ import org.apache.spark._ @@ -98,7 +96,9 @@ class ColumnarShuffleWriter[K, V]( private val reallocThreshold = GlutenConfig.getConf.columnarShuffleReallocThreshold - private val jniWrapper = ShuffleWriterJniWrapper.create() + private val runtime = Runtimes.contextInstance("ShuffleWriter") + + private val jniWrapper = ShuffleWriterJniWrapper.create(runtime) private var nativeShuffleWriter: Long = -1L @@ -153,36 +153,25 @@ class ColumnarShuffleWriter[K, V]( dataTmp.getAbsolutePath, blockManager.subDirsPerLocalDir, localDirs, - NativeMemoryManagers - .create( - "ShuffleWriter", - new Spiller() { - override def spill(self: MemoryTarget, size: Long): Long = { - if (nativeShuffleWriter == -1L) { - throw new IllegalStateException( - "Fatal: spill() called before a shuffle writer " + - "is created. 
This behavior should be optimized by moving memory " + - "allocations from make() to split()") - } - logInfo(s"Gluten shuffle writer: Trying to spill $size bytes of data") - // fixme pass true when being called by self - val spilled = - jniWrapper.nativeEvict(nativeShuffleWriter, size, false) - logInfo(s"Gluten shuffle writer: Spilled $spilled / $size bytes of data") - spilled - } - - override def applicablePhases(): java.util.Set[Spiller.Phase] = - Spillers.PHASE_SET_SPILL_ONLY - } - ) - .getNativeInstanceHandle, reallocThreshold, handle, taskContext.taskAttemptId(), GlutenShuffleUtils.getStartPartitionId(dep.nativePartitioning, taskContext.partitionId), shuffleWriterType ) + runtime.addSpiller(new Spiller() { + override def spill(self: MemoryTarget, phase: Spiller.Phase, size: Long): Long = { + if (!Spillers.PHASE_SET_SPILL_ONLY.contains(phase)) { + return 0L + } + logInfo(s"Gluten shuffle writer: Trying to spill $size bytes of data") + // fixme pass true when being called by self + val spilled = + jniWrapper.nativeEvict(nativeShuffleWriter, size, false) + logInfo(s"Gluten shuffle writer: Spilled $spilled / $size bytes of data") + spilled + } + }) } val startTime = System.nanoTime() jniWrapper.write(nativeShuffleWriter, rows, handle, availableOffHeapPerTask()) diff --git a/gluten-data/src/main/scala/org/apache/spark/sql/execution/ColumnarBuildSideRelation.scala b/gluten-data/src/main/scala/org/apache/spark/sql/execution/ColumnarBuildSideRelation.scala index 9d9f5ab1765c4..f7bcfd694d52d 100644 --- a/gluten-data/src/main/scala/org/apache/spark/sql/execution/ColumnarBuildSideRelation.scala +++ b/gluten-data/src/main/scala/org/apache/spark/sql/execution/ColumnarBuildSideRelation.scala @@ -19,9 +19,9 @@ package org.apache.spark.sql.execution import org.apache.gluten.columnarbatch.ColumnarBatches import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators -import org.apache.gluten.memory.nmm.NativeMemoryManagers import org.apache.gluten.sql.shims.SparkShimLoader -import org.apache.gluten.utils.{ArrowAbiUtil, Iterators} +import org.apache.gluten.utils.ArrowAbiUtil +import org.apache.gluten.utils.iterator.Iterators import org.apache.gluten.vectorized.{ColumnarBatchSerializerJniWrapper, NativeColumnarToRowJniWrapper} import org.apache.spark.sql.catalyst.InternalRow @@ -40,7 +40,8 @@ case class ColumnarBuildSideRelation(output: Seq[Attribute], batches: Array[Arra extends BuildSideRelation { override def deserialized: Iterator[ColumnarBatch] = { - val jniWrapper = ColumnarBatchSerializerJniWrapper.create() + val runtime = Runtimes.contextInstance("BuildSideRelation#deserialized") + val jniWrapper = ColumnarBatchSerializerJniWrapper.create(runtime) val serializeHandle: Long = { val allocator = ArrowBufferAllocators.contextInstance() val cSchema = ArrowSchema.allocateNew(allocator) @@ -49,11 +50,7 @@ case class ColumnarBuildSideRelation(output: Seq[Attribute], batches: Array[Arra SQLConf.get.sessionLocalTimeZone) ArrowAbiUtil.exportSchema(allocator, arrowSchema, cSchema) val handle = jniWrapper - .init( - cSchema.memoryAddress(), - NativeMemoryManagers - .contextInstance("BuildSideRelation#BatchSerializer") - .getNativeInstanceHandle) + .init(cSchema.memoryAddress()) cSchema.close() handle } @@ -71,7 +68,7 @@ case class ColumnarBuildSideRelation(output: Seq[Attribute], batches: Array[Arra jniWrapper .deserialize(serializeHandle, batches(batchId)) batchId += 1 - ColumnarBatches.create(Runtimes.contextInstance(), handle) + ColumnarBatches.create(handle) } }) 
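
The ColumnarBuildSideRelation hunk above shows the serializer path after the change: the JNI wrapper is bound to a named Runtime, init() takes only the exported schema address, and ColumnarBatches.create() no longer needs a Runtime argument. A condensed sketch follows, under the assumption that the Arrow schema has already been exported and the serialized payloads are supplied by the caller; the object and method names are illustrative.

import org.apache.gluten.columnarbatch.ColumnarBatches
import org.apache.gluten.exec.Runtimes
import org.apache.gluten.vectorized.ColumnarBatchSerializerJniWrapper

import org.apache.spark.sql.vectorized.ColumnarBatch

object DeserializeSketch {
  def deserializeAll(
      cSchemaAddress: Long,
      payloads: Iterator[Array[Byte]]): Iterator[ColumnarBatch] = {
    val runtime = Runtimes.contextInstance("BuildSideRelation#deserialized")
    val jniWrapper = ColumnarBatchSerializerJniWrapper.create(runtime)
    val serializeHandle = jniWrapper.init(cSchemaAddress) // no memory-manager handle anymore
    payloads.map { bytes =>
      val batchHandle = jniWrapper.deserialize(serializeHandle, bytes)
      ColumnarBatches.create(batchHandle) // Runtime argument dropped as well
    }
  }
}

Note also that NativeColumnarToRowJniWrapper.nativeColumnarToRowConvert now takes (c2rHandle, batchHandle) in that order, which is why the ColumnarBuildSideRelation.transform and ExecUtil call sites around this hunk swap their arguments.
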
.protectInvocationFlow() @@ -89,10 +86,9 @@ case class ColumnarBuildSideRelation(output: Seq[Attribute], batches: Array[Arra * was called in Spark Driver, should manage resources carefully. */ override def transform(key: Expression): Array[InternalRow] = TaskResources.runUnsafe { + val runtime = Runtimes.contextInstance("BuildSideRelation#transform") // This transformation happens in Spark driver, thus resources can not be managed automatically. - val runtime = Runtimes.contextInstance() - val nativeMemoryManager = NativeMemoryManagers.contextInstance("BuildSideRelation#transform") - val serializerJniWrapper = ColumnarBatchSerializerJniWrapper.create() + val serializerJniWrapper = ColumnarBatchSerializerJniWrapper.create(runtime) val serializeHandle = { val allocator = ArrowBufferAllocators.contextInstance() val cSchema = ArrowSchema.allocateNew(allocator) @@ -100,8 +96,7 @@ case class ColumnarBuildSideRelation(output: Seq[Attribute], batches: Array[Arra SparkShimLoader.getSparkShims.structFromAttributes(output), SQLConf.get.sessionLocalTimeZone) ArrowAbiUtil.exportSchema(allocator, arrowSchema, cSchema) - val handle = serializerJniWrapper - .init(cSchema.memoryAddress(), nativeMemoryManager.getNativeInstanceHandle) + val handle = serializerJniWrapper.init(cSchema.memoryAddress()) cSchema.close() handle } @@ -109,8 +104,8 @@ case class ColumnarBuildSideRelation(output: Seq[Attribute], batches: Array[Arra var closed = false // Convert columnar to Row. - val jniWrapper = NativeColumnarToRowJniWrapper.create() - val c2rId = jniWrapper.nativeColumnarToRowInit(nativeMemoryManager.getNativeInstanceHandle) + val jniWrapper = NativeColumnarToRowJniWrapper.create(runtime) + val c2rId = jniWrapper.nativeColumnarToRowInit() var batchId = 0 val iterator = if (batches.length > 0) { val res: Iterator[Iterator[InternalRow]] = new Iterator[Iterator[InternalRow]] { @@ -129,7 +124,7 @@ case class ColumnarBuildSideRelation(output: Seq[Attribute], batches: Array[Arra batchId += 1 val batchHandle = serializerJniWrapper.deserialize(serializeHandle, batchBytes) - val batch = ColumnarBatches.create(runtime, batchHandle) + val batch = ColumnarBatches.create(batchHandle) if (batch.numRows == 0) { batch.close() Iterator.empty @@ -141,7 +136,7 @@ case class ColumnarBuildSideRelation(output: Seq[Attribute], batches: Array[Arra val cols = batch.numCols() val rows = batch.numRows() val info = - jniWrapper.nativeColumnarToRowConvert(batchHandle, c2rId) + jniWrapper.nativeColumnarToRowConvert(c2rId, ColumnarBatches.getNativeHandle(batch)) batch.close() val columnNames = key.flatMap { case expression: AttributeReference => diff --git a/gluten-data/src/main/scala/org/apache/spark/sql/execution/utils/ExecUtil.scala b/gluten-data/src/main/scala/org/apache/spark/sql/execution/utils/ExecUtil.scala index 083915f12db94..77f35ff48fcc2 100644 --- a/gluten-data/src/main/scala/org/apache/spark/sql/execution/utils/ExecUtil.scala +++ b/gluten-data/src/main/scala/org/apache/spark/sql/execution/utils/ExecUtil.scala @@ -17,9 +17,9 @@ package org.apache.spark.sql.execution.utils import org.apache.gluten.columnarbatch.ColumnarBatches +import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators -import org.apache.gluten.memory.nmm.NativeMemoryManagers -import org.apache.gluten.utils.Iterators +import org.apache.gluten.utils.iterator.Iterators import org.apache.gluten.vectorized.{ArrowWritableColumnVector, NativeColumnarToRowInfo, NativeColumnarToRowJniWrapper, NativePartitioning} import 
org.apache.spark.{Partitioner, RangePartitioner, ShuffleDependency} @@ -41,14 +41,12 @@ import org.apache.spark.util.MutablePair object ExecUtil { def convertColumnarToRow(batch: ColumnarBatch): Iterator[InternalRow] = { - val jniWrapper = NativeColumnarToRowJniWrapper.create() + val runtime = Runtimes.contextInstance("ExecUtil#ColumnarToRow") + val jniWrapper = NativeColumnarToRowJniWrapper.create(runtime) var info: NativeColumnarToRowInfo = null val batchHandle = ColumnarBatches.getNativeHandle(batch) - val c2rHandle = jniWrapper.nativeColumnarToRowInit( - NativeMemoryManagers - .contextInstance("ExecUtil#ColumnarToRow") - .getNativeInstanceHandle) - info = jniWrapper.nativeColumnarToRowConvert(batchHandle, c2rHandle) + val c2rHandle = jniWrapper.nativeColumnarToRowInit() + info = jniWrapper.nativeColumnarToRowConvert(c2rHandle, batchHandle) Iterators .wrap(new Iterator[InternalRow] { @@ -147,7 +145,7 @@ object ExecUtil { val newHandle = ColumnarBatches.compose(pidBatch, cb) // Composed batch already hold pidBatch's shared ref, so close is safe. ColumnarBatches.forceClose(pidBatch) - (0, ColumnarBatches.create(ColumnarBatches.getRuntime(cb), newHandle)) + (0, ColumnarBatches.create(newHandle)) }) .recyclePayload(p => ColumnarBatches.forceClose(p._2)) // FIXME why force close? .create() diff --git a/gluten-delta/src/main/scala/org/apache/gluten/execution/DeltaScanTransformer.scala b/gluten-delta/src/main/scala/org/apache/gluten/execution/DeltaScanTransformer.scala index 9e97a3687656a..1cd735cf7ee7d 100644 --- a/gluten-delta/src/main/scala/org/apache/gluten/execution/DeltaScanTransformer.scala +++ b/gluten-delta/src/main/scala/org/apache/gluten/execution/DeltaScanTransformer.scala @@ -82,14 +82,12 @@ case class DeltaScanTransformer( object DeltaScanTransformer { - def apply( - scanExec: FileSourceScanExec, - newPartitionFilters: Seq[Expression]): DeltaScanTransformer = { + def apply(scanExec: FileSourceScanExec): DeltaScanTransformer = { new DeltaScanTransformer( scanExec.relation, scanExec.output, scanExec.requiredSchema, - newPartitionFilters, + scanExec.partitionFilters, scanExec.optionalBucketSet, scanExec.optionalNumCoalescedBuckets, scanExec.dataFilters, diff --git a/gluten-delta/src/main/scala/org/apache/gluten/execution/DeltaScanTransformerProvider.scala b/gluten-delta/src/main/scala/org/apache/gluten/execution/DeltaScanTransformerProvider.scala index a7cecde7c0db0..e482150b8e298 100644 --- a/gluten-delta/src/main/scala/org/apache/gluten/execution/DeltaScanTransformerProvider.scala +++ b/gluten-delta/src/main/scala/org/apache/gluten/execution/DeltaScanTransformerProvider.scala @@ -16,7 +16,6 @@ */ package org.apache.gluten.execution -import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.execution.FileSourceScanExec class DeltaScanTransformerProvider extends DataSourceScanTransformerRegister { @@ -24,8 +23,7 @@ class DeltaScanTransformerProvider extends DataSourceScanTransformerRegister { override val scanClassName: String = "org.apache.spark.sql.delta.DeltaParquetFileFormat" override def createDataSourceTransformer( - batchScan: FileSourceScanExec, - newPartitionFilters: Seq[Expression]): FileSourceScanExecTransformerBase = { - DeltaScanTransformer(batchScan, newPartitionFilters) + batchScan: FileSourceScanExec): FileSourceScanExecTransformerBase = { + DeltaScanTransformer(batchScan) } } diff --git a/gluten-iceberg/pom.xml b/gluten-iceberg/pom.xml index 727077f26797c..69630b8cf0eca 100644 --- a/gluten-iceberg/pom.xml +++ b/gluten-iceberg/pom.xml @@ 
-63,10 +63,6 @@ test-jar test - - org.apache.spark - spark-core_${scala.binary.version} - org.apache.spark spark-core_${scala.binary.version} diff --git a/gluten-iceberg/src/main/java/org/apache/gluten/substrait/rel/IcebergLocalFilesNode.java b/gluten-iceberg/src/main/java/org/apache/gluten/substrait/rel/IcebergLocalFilesNode.java index 7d065f105a438..ba6b0ac4a0291 100644 --- a/gluten-iceberg/src/main/java/org/apache/gluten/substrait/rel/IcebergLocalFilesNode.java +++ b/gluten-iceberg/src/main/java/org/apache/gluten/substrait/rel/IcebergLocalFilesNode.java @@ -42,6 +42,8 @@ public class IcebergLocalFilesNode extends LocalFilesNode { paths, starts, lengths, + new ArrayList<>(), + new ArrayList<>(), partitionColumns, new ArrayList<>(), fileFormat, diff --git a/gluten-iceberg/src/main/scala/org/apache/gluten/execution/IcebergScanTransformer.scala b/gluten-iceberg/src/main/scala/org/apache/gluten/execution/IcebergScanTransformer.scala index 6e079bf7e10a7..5a735b802adbe 100644 --- a/gluten-iceberg/src/main/scala/org/apache/gluten/execution/IcebergScanTransformer.scala +++ b/gluten-iceberg/src/main/scala/org/apache/gluten/execution/IcebergScanTransformer.scala @@ -80,13 +80,11 @@ case class IcebergScanTransformer( } object IcebergScanTransformer { - def apply( - batchScan: BatchScanExec, - newPartitionFilters: Seq[Expression]): IcebergScanTransformer = { + def apply(batchScan: BatchScanExec): IcebergScanTransformer = { new IcebergScanTransformer( batchScan.output, batchScan.scan, - newPartitionFilters, + batchScan.runtimeFilters, table = SparkShimLoader.getSparkShims.getBatchScanExecTable(batchScan), keyGroupedPartitioning = SparkShimLoader.getSparkShims.getKeyGroupedPartitioning(batchScan), commonPartitionValues = SparkShimLoader.getSparkShims.getCommonPartitionValues(batchScan) diff --git a/gluten-iceberg/src/main/scala/org/apache/gluten/execution/IcebergTransformerProvider.scala b/gluten-iceberg/src/main/scala/org/apache/gluten/execution/IcebergTransformerProvider.scala index 1ebeebf00be0b..dc521f39c1b95 100644 --- a/gluten-iceberg/src/main/scala/org/apache/gluten/execution/IcebergTransformerProvider.scala +++ b/gluten-iceberg/src/main/scala/org/apache/gluten/execution/IcebergTransformerProvider.scala @@ -16,7 +16,6 @@ */ package org.apache.gluten.execution -import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.execution.datasources.v2.BatchScanExec class IcebergTransformerProvider extends DataSourceScanTransformerRegister { @@ -24,8 +23,7 @@ class IcebergTransformerProvider extends DataSourceScanTransformerRegister { override val scanClassName: String = "org.apache.iceberg.spark.source.SparkBatchQueryScan" override def createDataSourceV2Transformer( - batchScan: BatchScanExec, - newPartitionFilters: Seq[Expression]): BatchScanExecTransformerBase = { - IcebergScanTransformer(batchScan, newPartitionFilters) + batchScan: BatchScanExec): BatchScanExecTransformerBase = { + IcebergScanTransformer(batchScan) } } diff --git a/gluten-ras/common/src/main/scala/org/apache/gluten/ras/best/GroupBasedBestFinder.scala b/gluten-ras/common/src/main/scala/org/apache/gluten/ras/best/GroupBasedBestFinder.scala index effebd41bb3b6..1128ab8dec01d 100644 --- a/gluten-ras/common/src/main/scala/org/apache/gluten/ras/best/GroupBasedBestFinder.scala +++ b/gluten-ras/common/src/main/scala/org/apache/gluten/ras/best/GroupBasedBestFinder.scala @@ -82,15 +82,23 @@ private object GroupBasedBestFinder { return Some(KnownCostPath(ras, path)) } val childrenGroups = 
can.getChildrenGroups(allGroups).map(gn => allGroups(gn.groupId())) - val maybeBestChildrenPaths: Seq[Option[RasPath[T]]] = childrenGroups.map { - childGroup => childrenGroupsOutput(childGroup).map(kcg => kcg.best().rasPath) + val maybeBestChildrenPaths: Seq[Option[KnownCostPath[T]]] = childrenGroups.map { + childGroup => childrenGroupsOutput(childGroup).map(kcg => kcg.best()) } if (maybeBestChildrenPaths.exists(_.isEmpty)) { // Node should only be solved when all children outputs exist. return None } val bestChildrenPaths = maybeBestChildrenPaths.map(_.get) - Some(KnownCostPath(ras, path.RasPath(ras, can, bestChildrenPaths).get)) + val kcp = KnownCostPath(ras, path.RasPath(ras, can, bestChildrenPaths.map(_.rasPath)).get) + // Cost should be in monotonically increasing basis. + bestChildrenPaths.map(_.cost).foreach { + childCost => + assert( + ras.costModel.costComparator().gteq(kcp.cost, childCost), + "Illegal decreasing cost") + } + Some(kcp) } override def solveGroup( diff --git a/gluten-ras/common/src/main/scala/org/apache/gluten/ras/memo/ForwardMemoTable.scala b/gluten-ras/common/src/main/scala/org/apache/gluten/ras/memo/ForwardMemoTable.scala index dd4033866f861..b99fb280fe5a5 100644 --- a/gluten-ras/common/src/main/scala/org/apache/gluten/ras/memo/ForwardMemoTable.scala +++ b/gluten-ras/common/src/main/scala/org/apache/gluten/ras/memo/ForwardMemoTable.scala @@ -155,7 +155,7 @@ class ForwardMemoTable[T <: AnyRef] private (override val ras: Ras[T]) groupBuffer(id) } - override def allClusterKeys(): Seq[RasClusterKey] = clusterKeyBuffer + override def allClusterKeys(): Seq[RasClusterKey] = clusterKeyBuffer.toSeq override def allGroupIds(): Seq[Int] = { val from = -dummyGroupBuffer.size diff --git a/gluten-ras/common/src/main/scala/org/apache/gluten/ras/path/PathMask.scala b/gluten-ras/common/src/main/scala/org/apache/gluten/ras/path/PathMask.scala index a8caded407b16..c7dd3d2c0718e 100644 --- a/gluten-ras/common/src/main/scala/org/apache/gluten/ras/path/PathMask.scala +++ b/gluten-ras/common/src/main/scala/org/apache/gluten/ras/path/PathMask.scala @@ -96,7 +96,7 @@ object PathMask { return None } - PathMask(buffer) + PathMask(buffer.toSeq) } Some(out) @@ -168,7 +168,7 @@ object PathMask { dfs(0, 0) - PathMask(buffer) + PathMask(buffer.toSeq) } // Return the sub-mask whose root node is the node at the input index diff --git a/gluten-ras/common/src/main/scala/org/apache/gluten/ras/path/Pattern.scala b/gluten-ras/common/src/main/scala/org/apache/gluten/ras/path/Pattern.scala index e60a947176545..f54b031b0aefe 100644 --- a/gluten-ras/common/src/main/scala/org/apache/gluten/ras/path/Pattern.scala +++ b/gluten-ras/common/src/main/scala/org/apache/gluten/ras/path/Pattern.scala @@ -87,14 +87,35 @@ object Pattern { override def children(count: Int): Seq[Node[T]] = (0 until count).map(_ => ignore[T]) } - private case class Branch[T <: AnyRef](matcher: Matcher[T], children: Seq[Node[T]]) + private case class Branch[T <: AnyRef](matcher: Matcher[T], children: Branch.ChildrenFactory[T]) extends Node[T] { override def skip(): Boolean = false - override def abort(node: CanonicalNode[T]): Boolean = node.childrenCount != children.size + override def abort(node: CanonicalNode[T]): Boolean = + !children.acceptsChildrenCount(node.childrenCount) override def matches(node: CanonicalNode[T]): Boolean = matcher(node.self()) override def children(count: Int): Seq[Node[T]] = { - assert(count == children.size) - children + assert(children.acceptsChildrenCount(count)) + (0 until count).map(children.child) + } + } 
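[Editor's note] The GroupBasedBestFinder hunk above adds an "Illegal decreasing cost" assertion when a best path is assembled from its children's best paths. As a minimal, self-contained sketch of that invariant only — the names Cost, assertMonotonic, and the implicit Ordering are illustrative stand-ins, not the patch's API:

    // Sketch: a parent path's cost must never compare lower than any child's
    // cost under the cost model's ordering, mirroring the assertion added above.
    object CostMonotonicity {
      def assertMonotonic[Cost](parentCost: Cost, childCosts: Seq[Cost])(
          implicit ord: Ordering[Cost]): Unit = {
        childCosts.foreach { childCost =>
          assert(
            ord.gteq(parentCost, childCost),
            s"Illegal decreasing cost: parent=$parentCost, child=$childCost")
        }
      }

      def main(args: Array[String]): Unit = {
        assertMonotonic(10L, Seq(3L, 7L)) // holds: parent cost dominates both children
        // assertMonotonic(5L, Seq(7L))   // would trip the assertion
      }
    }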
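[Editor's note] Several hunks in this patch (ForwardMemoTable.allClusterKeys and PathMask above, MockMemoState and GlutenTestsTrait further down) append .toSeq when returning the contents of a mutable buffer, most likely to keep the Scala 2.13 cross-build compiling. A minimal sketch of the pattern under that assumption — the class and method names here are hypothetical, not taken from the patch:

    import scala.collection.mutable.ArrayBuffer

    // Hypothetical registry used only to illustrate the .toSeq return pattern.
    final class KeyRegistry[K] {
      private val keys = ArrayBuffer.empty[K]

      def register(key: K): Unit = keys += key

      // On Scala 2.13, scala.Seq aliases scala.collection.immutable.Seq, so
      // returning the mutable ArrayBuffer directly no longer type-checks.
      // .toSeq yields an immutable copy, which compiles on both 2.12 and 2.13
      // and also stops callers from observing later mutations of the buffer.
      def allKeys: Seq[K] = keys.toSeq
    }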
+ + private object Branch { + trait ChildrenFactory[T <: AnyRef] { + def child(index: Int): Node[T] + def acceptsChildrenCount(count: Int): Boolean + } + + object ChildrenFactory { + case class Plain[T <: AnyRef](nodes: Seq[Node[T]]) extends ChildrenFactory[T] { + override def child(index: Int): Node[T] = nodes(index) + override def acceptsChildrenCount(count: Int): Boolean = nodes.size == count + } + + case class Func[T <: AnyRef](arity: Int => Boolean, func: Int => Node[T]) + extends ChildrenFactory[T] { + override def child(index: Int): Node[T] = func(index) + override def acceptsChildrenCount(count: Int): Boolean = arity(count) + } } } @@ -102,8 +123,15 @@ object Pattern { def ignore[T <: AnyRef]: Node[T] = Ignore.INSTANCE.asInstanceOf[Node[T]] def node[T <: AnyRef](matcher: Matcher[T]): Node[T] = Single(matcher) def branch[T <: AnyRef](matcher: Matcher[T], children: Node[T]*): Node[T] = - Branch(matcher, children.toSeq) - def leaf[T <: AnyRef](matcher: Matcher[T]): Node[T] = Branch(matcher, List.empty) + Branch(matcher, Branch.ChildrenFactory.Plain(children.toSeq)) + // Similar to #branch, but with unknown arity. + def branch2[T <: AnyRef]( + matcher: Matcher[T], + arity: Int => Boolean, + children: Int => Node[T]): Node[T] = + Branch(matcher, Branch.ChildrenFactory.Func(arity, children)) + def leaf[T <: AnyRef](matcher: Matcher[T]): Node[T] = + Branch(matcher, Branch.ChildrenFactory.Plain(List.empty)) implicit class NodeImplicits[T <: AnyRef](node: Node[T]) { def build(): Pattern[T] = { diff --git a/gluten-ras/common/src/test/scala/org/apache/gluten/ras/OperationSuite.scala b/gluten-ras/common/src/test/scala/org/apache/gluten/ras/OperationSuite.scala index 60ec2eedd4105..e1ccfa1f44aa0 100644 --- a/gluten-ras/common/src/test/scala/org/apache/gluten/ras/OperationSuite.scala +++ b/gluten-ras/common/src/test/scala/org/apache/gluten/ras/OperationSuite.scala @@ -230,7 +230,7 @@ class OperationSuite extends AnyFunSuite { 48, Unary3(48, Unary3(48, Unary3(48, Unary3(48, Unary3(48, Unary3(48, Leaf(30)))))))))))) assert(costModel.costOfCount == 32) // TODO reduce this for performance - assert(costModel.costCompareCount == 20) // TODO reduce this for performance + assert(costModel.costCompareCount == 50) // TODO reduce this for performance } test("Cost evaluation count - max cost") { @@ -292,7 +292,7 @@ class OperationSuite extends AnyFunSuite { 48, Unary3(48, Unary3(48, Unary3(48, Unary3(48, Unary3(48, Unary3(48, Leaf(30)))))))))))) assert(costModel.costOfCount == 32) // TODO reduce this for performance - assert(costModel.costCompareCount == 20) // TODO reduce this for performance + assert(costModel.costCompareCount == 50) // TODO reduce this for performance } } diff --git a/gluten-ras/common/src/test/scala/org/apache/gluten/ras/mock/MockMemoState.scala b/gluten-ras/common/src/test/scala/org/apache/gluten/ras/mock/MockMemoState.scala index 37d66e2bd7031..1c8458af3c61e 100644 --- a/gluten-ras/common/src/test/scala/org/apache/gluten/ras/mock/MockMemoState.scala +++ b/gluten-ras/common/src/test/scala/org/apache/gluten/ras/mock/MockMemoState.scala @@ -102,7 +102,7 @@ object MockMemoState { nodeBuffer ++= nodes } - override def nodes(): Seq[CanonicalNode[T]] = nodeBuffer + override def nodes(): Seq[CanonicalNode[T]] = nodeBuffer.toSeq } object MockMutableCluster { @@ -153,7 +153,7 @@ object MockMemoState { group } - def allGroups(): Seq[MockMutableGroup[T]] = groupBuffer + def allGroups(): Seq[MockMutableGroup[T]] = groupBuffer.toSeq } object Factory { diff --git 
a/gluten-ras/common/src/test/scala/org/apache/gluten/ras/rule/PatternSuite.scala b/gluten-ras/common/src/test/scala/org/apache/gluten/ras/rule/PatternSuite.scala index 64b66bbaffae7..dc7f5e883022e 100644 --- a/gluten-ras/common/src/test/scala/org/apache/gluten/ras/rule/PatternSuite.scala +++ b/gluten-ras/common/src/test/scala/org/apache/gluten/ras/rule/PatternSuite.scala @@ -59,6 +59,29 @@ class PatternSuite extends AnyFunSuite { assert(pattern.matches(path, 1)) } + test("Match branch") { + val ras = + Ras[TestNode]( + PlanModelImpl, + CostModelImpl, + MetadataModelImpl, + PropertyModelImpl, + ExplainImpl, + RasRule.Factory.none()) + + val path1 = MockRasPath.mock(ras, Branch("n1", List())) + val path2 = MockRasPath.mock(ras, Branch("n1", List(Leaf("n2", 1)))) + val path3 = MockRasPath.mock(ras, Branch("n1", List(Leaf("n2", 1), Leaf("n3", 1)))) + + val pattern = + Pattern.branch2[TestNode](n => n.isInstanceOf[Branch], _ >= 1, _ => Pattern.any).build() + assert(!pattern.matches(path1, 1)) + assert(pattern.matches(path2, 1)) + assert(pattern.matches(path2, 2)) + assert(pattern.matches(path3, 1)) + assert(pattern.matches(path3, 2)) + } + test("Match unary") { val ras = Ras[TestNode]( @@ -231,17 +254,20 @@ object PatternSuite { case class Unary(name: String, child: TestNode) extends UnaryLike { override def selfCost(): Long = 1 - override def withNewChildren(child: TestNode): UnaryLike = copy(child = child) } case class Binary(name: String, left: TestNode, right: TestNode) extends BinaryLike { override def selfCost(): Long = 1 - override def withNewChildren(left: TestNode, right: TestNode): BinaryLike = copy(left = left, right = right) } + case class Branch(name: String, children: Seq[TestNode]) extends TestNode { + override def selfCost(): Long = 1 + override def withNewChildren(children: Seq[TestNode]): TestNode = copy(children = children) + } + case class DummyGroup() extends LeafLike { override def makeCopy(): LeafLike = throw new UnsupportedOperationException() override def selfCost(): Long = throw new UnsupportedOperationException() diff --git a/gluten-ras/pom.xml b/gluten-ras/pom.xml index e2e8fccb22001..973af760fa87f 100644 --- a/gluten-ras/pom.xml +++ b/gluten-ras/pom.xml @@ -32,7 +32,7 @@ org.scalacheck scalacheck_${scala.binary.version} - 1.13.5 + 1.17.0 test @@ -48,13 +48,13 @@ org.scalatestplus - scalatestplus-mockito_2.12 + scalatestplus-mockito_${scala.binary.version} 1.0.0-M2 test org.scalatestplus - scalatestplus-scalacheck_2.12 + scalatestplus-scalacheck_${scala.binary.version} 3.1.0.0-RC2 test diff --git a/gluten-uniffle/velox/src/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java b/gluten-uniffle/velox/src/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java index 471d70f4d1a51..ca5b3ad9529f8 100644 --- a/gluten-uniffle/velox/src/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java +++ b/gluten-uniffle/velox/src/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java @@ -18,10 +18,11 @@ import org.apache.gluten.GlutenConfig; import org.apache.gluten.columnarbatch.ColumnarBatches; +import org.apache.gluten.exec.Runtime; +import org.apache.gluten.exec.Runtimes; import org.apache.gluten.memory.memtarget.MemoryTarget; import org.apache.gluten.memory.memtarget.Spiller; import org.apache.gluten.memory.memtarget.Spillers; -import org.apache.gluten.memory.nmm.NativeMemoryManagers; import org.apache.gluten.vectorized.ShuffleWriterJniWrapper; import 
org.apache.gluten.vectorized.SplitResult; @@ -45,7 +46,6 @@ import java.io.IOException; import java.util.List; -import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.function.Function; @@ -67,7 +67,8 @@ public class VeloxUniffleColumnarShuffleWriter extends RssShuffleWriter> records) throws IOException { bufferSize, bufferSize, partitionPusher, - NativeMemoryManagers.create( - "UniffleShuffleWriter", - new Spiller() { - @Override - public long spill(MemoryTarget self, long size) { - if (nativeShuffleWriter == -1) { - throw new IllegalStateException( - "Fatal: spill() called before a shuffle shuffle writer " - + "evaluator is created. This behavior should be" - + "optimized by moving memory " - + "allocations from make() to split()"); - } - LOG.info( - "Gluten shuffle writer: Trying to push {} bytes of data", size); - long pushed = - jniWrapper.nativeEvict(nativeShuffleWriter, size, false); - LOG.info( - "Gluten shuffle writer: Pushed {} / {} bytes of data", - pushed, - size); - return pushed; - } - - @Override - public Set applicablePhases() { - return Spillers.PHASE_SET_SPILL_ONLY; - } - }) - .getNativeInstanceHandle(), handle, taskAttemptId, GlutenShuffleUtils.getStartPartitionId( @@ -186,6 +158,19 @@ public Set applicablePhases() { "uniffle", isSort ? "sort" : "hash", reallocThreshold); + runtime.addSpiller( + new Spiller() { + @Override + public long spill(MemoryTarget self, Spiller.Phase phase, long size) { + if (!Spillers.PHASE_SET_SPILL_ONLY.contains(phase)) { + return 0L; + } + LOG.info("Gluten shuffle writer: Trying to push {} bytes of data", size); + long pushed = jniWrapper.nativeEvict(nativeShuffleWriter, size, false); + LOG.info("Gluten shuffle writer: Pushed {} / {} bytes of data", pushed, size); + return pushed; + } + }); } long startTime = System.nanoTime(); long bytes = @@ -242,7 +227,7 @@ public Option stop(boolean success) { closeShuffleWriter(); return super.stop(success); } - return Option.empty(); + return Option.empty(); } private void closeShuffleWriter() { diff --git a/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenSQLTestsBaseTrait.scala b/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenSQLTestsBaseTrait.scala index 8c55b823a06c4..4c06b02a1fb47 100644 --- a/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenSQLTestsBaseTrait.scala +++ b/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenSQLTestsBaseTrait.scala @@ -20,36 +20,13 @@ import org.apache.gluten.GlutenConfig import org.apache.gluten.utils.{BackendTestUtils, SystemParameters} import org.apache.spark.SparkConf -import org.apache.spark.sql.GlutenTestConstants.GLUTEN_TEST import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, ShuffleQueryStageExec} import org.apache.spark.sql.test.SharedSparkSession -import org.scalactic.source.Position -import org.scalatest.Tag - /** Basic trait for Gluten SQL test cases. 
*/ trait GlutenSQLTestsBaseTrait extends SharedSparkSession with GlutenTestsBaseTrait { - protected def testGluten(testName: String, testTag: Tag*)(testFun: => Any)(implicit - pos: Position): Unit = { - test(GLUTEN_TEST + testName, testTag: _*)(testFun) - } - - protected def ignoreGluten(testName: String, testTag: Tag*)(testFun: => Any)(implicit - pos: Position): Unit = { - super.ignore(GLUTEN_TEST + testName, testTag: _*)(testFun) - } - - override protected def test(testName: String, testTags: Tag*)(testFun: => Any)(implicit - pos: Position): Unit = { - if (shouldRun(testName)) { - super.test(testName, testTags: _*)(testFun) - } else { - super.ignore(testName, testTags: _*)(testFun) - } - } - override def sparkConf: SparkConf = { GlutenSQLTestsBaseTrait.nativeSparkConf(super.sparkConf, warehouse) } diff --git a/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsBaseTrait.scala b/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsBaseTrait.scala index 7c6dcbbee83df..a0ab973061660 100644 --- a/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsBaseTrait.scala +++ b/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsBaseTrait.scala @@ -18,7 +18,13 @@ package org.apache.spark.sql import org.apache.gluten.utils.BackendTestSettings -trait GlutenTestsBaseTrait { +import org.apache.spark.sql.GlutenTestConstants.GLUTEN_TEST + +import org.scalactic.source.Position +import org.scalatest.Tag +import org.scalatest.funsuite.AnyFunSuiteLike + +trait GlutenTestsBaseTrait extends AnyFunSuiteLike { protected val rootPath: String = getClass.getResource("/").getPath protected val basePath: String = rootPath + "unit-tests-working-home" @@ -30,7 +36,7 @@ trait GlutenTestsBaseTrait { // list will never be run with no regard to backend test settings. 
def testNameBlackList: Seq[String] = Seq() - def shouldRun(testName: String): Boolean = { + protected def shouldRun(testName: String): Boolean = { if (testNameBlackList.exists(_.equalsIgnoreCase(GlutenTestConstants.IGNORE_ALL))) { return false } @@ -39,4 +45,24 @@ trait GlutenTestsBaseTrait { } BackendTestSettings.shouldRun(getClass.getCanonicalName, testName) } + + protected def testGluten(testName: String, testTag: Tag*)(testFun: => Any)(implicit + pos: Position): Unit = { + test(GLUTEN_TEST + testName, testTag: _*)(testFun) + } + + protected def ignoreGluten(testName: String, testTag: Tag*)(testFun: => Any)(implicit + pos: Position): Unit = { + super.ignore(GLUTEN_TEST + testName, testTag: _*)(testFun) + } + + override protected def test(testName: String, testTags: Tag*)(testFun: => Any)(implicit + pos: Position): Unit = { + if (shouldRun(testName)) { + super.test(testName, testTags: _*)(testFun) + } else { + super.ignore(testName, testTags: _*)(testFun) + } + } + } diff --git a/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsCommonTrait.scala b/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsCommonTrait.scala index 06b9fca67bf7b..b9ee199eb1af9 100644 --- a/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsCommonTrait.scala +++ b/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsCommonTrait.scala @@ -19,11 +19,9 @@ package org.apache.spark.sql import org.apache.gluten.test.TestStats import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.GlutenTestConstants.GLUTEN_TEST import org.apache.spark.sql.catalyst.expressions._ -import org.scalactic.source.Position -import org.scalatest.{Args, Status, Tag} +import org.scalatest.{Args, Status} trait GlutenTestsCommonTrait extends SparkFunSuite @@ -48,23 +46,4 @@ trait GlutenTestsCommonTrait TestStats.endCase(status.succeeds()); status } - - protected def testGluten(testName: String, testTag: Tag*)(testFun: => Any)(implicit - pos: Position): Unit = { - test(GLUTEN_TEST + testName, testTag: _*)(testFun) - } - - protected def ignoreGluten(testName: String, testTag: Tag*)(testFun: => Any)(implicit - pos: Position): Unit = { - super.ignore(GLUTEN_TEST + testName, testTag: _*)(testFun) - } - - override protected def test(testName: String, testTags: Tag*)(testFun: => Any)(implicit - pos: Position): Unit = { - if (shouldRun(testName)) { - super.test(testName, testTags: _*)(testFun) - } else { - super.ignore(testName, testTags: _*)(testFun) - } - } } diff --git a/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsTrait.scala b/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsTrait.scala index ee765ed360991..5df9d007193ef 100644 --- a/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsTrait.scala +++ b/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsTrait.scala @@ -360,6 +360,6 @@ trait GlutenTestsTrait extends GlutenTestsCommonTrait { } _spark.internalCreateDataFrame( _spark.sparkContext.parallelize(Seq(inputRow)), - StructType(structFileSeq)) + StructType(structFileSeq.toSeq)) } } diff --git a/gluten-ut/pom.xml b/gluten-ut/pom.xml index 2396087fcc335..1e474042d0fa2 100644 --- a/gluten-ut/pom.xml +++ b/gluten-ut/pom.xml @@ -92,13 +92,13 @@ io.delta ${delta.package.name}_${scala.binary.version} - provided + test org.scalacheck scalacheck_${scala.binary.version} - 1.13.5 + 1.17.0 test @@ -145,13 +145,13 @@ org.scalatestplus - scalatestplus-mockito_2.12 + scalatestplus-mockito_${scala.binary.version} 1.0.0-M2 test org.scalatestplus 
- scalatestplus-scalacheck_2.12 + scalatestplus-scalacheck_${scala.binary.version} 3.1.0.0-RC2 test diff --git a/gluten-ut/spark32/pom.xml b/gluten-ut/spark32/pom.xml index e026bb424129d..b0744589d1611 100644 --- a/gluten-ut/spark32/pom.xml +++ b/gluten-ut/spark32/pom.xml @@ -76,7 +76,7 @@ org.apache.arrow arrow-c-data - ${arrow.version} + ${arrow-gluten.version} test diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index afc427cd3664f..60df3ee37f669 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -68,6 +68,7 @@ class ClickHouseTestSettings extends BackendTestSettings { false // nativeDoValidate failed due to spark conf cleanup case "GlutenDataSourceV2SQLSuite" => false // nativeDoValidate failed due to spark conf cleanup + case "GlutenSortShuffleSuite" => false case _ => true } preCheck && super.shouldRun(suiteName, testName) @@ -171,6 +172,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("shuffle function - array for primitive type not containing null") .exclude("shuffle function - array for primitive type containing null") .exclude("shuffle function - array for non-primitive type") + .exclude("flatten function") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite].exclude( @@ -299,9 +301,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-30793: truncate timestamps before the epoch to seconds and minutes") .excludeGlutenTest("unix_timestamp") .excludeGlutenTest("to_unix_timestamp") - .exclude("to_utc_timestamp with literal zone") .exclude("to_utc_timestamp with column zone") - .exclude("from_utc_timestamp with literal zone") .exclude("from_utc_timestamp with column zone") enableSuite[GlutenDeprecatedAPISuite] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOff].excludeGlutenTest( @@ -437,8 +437,10 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("string regex_replace / regex_extract") .exclude("string overlay function") .exclude("binary overlay function") - .exclude("string / binary substring function") .exclude("string parse_url function") + .exclude("string / binary length function") + .exclude("SPARK-36751: add octet length api for scala") + .exclude("SPARK-36751: add bit length api for scala") enableSuite[GlutenSubquerySuite] .exclude("SPARK-15370: COUNT bug in subquery in subquery in subquery") .exclude("SPARK-26893: Allow pushdown of partition pruning subquery filters to file source") @@ -562,7 +564,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-28322: IntegralDivide supports decimal type") .exclude("SPARK-33008: division by zero on divide-like operations returns incorrect result") .exclude("SPARK-34920: error class") - enableSuite[GlutenBitwiseExpressionsSuite].exclude("BitGet") + enableSuite[GlutenBitwiseExpressionsSuite] enableSuite[GlutenCastSuite] .exclude("null cast") .exclude("cast string to date") @@ -675,7 +677,6 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("Sequence with default step") .exclude("Reverse") .exclude("elementAt") - .exclude("Flatten") .exclude("ArrayRepeat") .exclude("Array remove") .exclude("Array Distinct") @@ -845,7 +846,6 @@ class 
ClickHouseTestSettings extends BackendTestSettings { .excludeGlutenTest("default") enableSuite[GlutenMathExpressionsSuite] .exclude("tanh") - .exclude("rint") .exclude("unhex") .exclude("atan2") .exclude("round/bround") @@ -896,31 +896,20 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-34814: LikeSimplification should handle NULL") enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix") enableSuite[GlutenStringExpressionsSuite] - .exclude("concat") .exclude("StringComparison") .exclude("Substring") .exclude("string substring_index function") .exclude("ascii for string") - .exclude("string for ascii") .exclude("base64/unbase64 for string") .exclude("encode/decode for string") - .exclude("Levenshtein distance") - .exclude("soundex unit test") - .exclude("replace") .exclude("overlay for string") .exclude("overlay for byte array") .exclude("translate") - .exclude("FORMAT") - .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB") - .exclude("INSTR") .exclude("LOCATE") .exclude("LPAD/RPAD") .exclude("REPEAT") - .exclude("length for string / binary") - .exclude("format_number / FormatNumber") .exclude("ParseUrl") .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url") - .exclude("Sentences") .excludeGlutenTest("SPARK-40213: ascii for Latin-1 Supplement characters") enableSuite[GlutenTryCastSuite] .exclude("null cast") diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index c78d8230e3a62..a17f72de31214 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -18,6 +18,7 @@ package org.apache.gluten.utils.velox import org.apache.gluten.utils.{BackendTestSettings, SQLQueryTestSettings} +import org.apache.spark.GlutenSortShuffleSuite import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.connector.{GlutenDataSourceV2DataFrameSessionCatalogSuite, GlutenDataSourceV2DataFrameSuite, GlutenDataSourceV2FunctionSuite, GlutenDataSourceV2SQLSessionCatalogSuite, GlutenDataSourceV2SQLSuite, GlutenDataSourceV2Suite, GlutenFileDataSourceV2FallBackSuite, GlutenLocalScanSuite, GlutenSupportsCatalogOptionsSuite, GlutenTableCapabilityCheckSuite, GlutenWriteDistributionAndOrderingSuite} @@ -229,6 +230,7 @@ class VeloxTestSettings extends BackendTestSettings { // Spark round UT for round(3.1415,3) is not correct. .exclude("round/bround") enableSuite[GlutenMathFunctionsSuite] + enableSuite[GlutenSortShuffleSuite] enableSuite[GlutenSortOrderExpressionsSuite] enableSuite[GlutenBitwiseExpressionsSuite] enableSuite[GlutenStringExpressionsSuite] @@ -263,14 +265,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("to_timestamp") // Legacy mode is not supported, assuming this mode is not commonly used. .exclude("SPARK-30668: use legacy timestamp parser in to_timestamp") - // Replaced by another test. - .exclude("to_utc_timestamp with literal zone") - // Replaced by another test. 
- .exclude("to_utc_timestamp with column zone") - // Replaced by another test - .exclude("from_utc_timestamp with literal zone") - // Replaced by another test - .exclude("from_utc_timestamp with column zone") enableSuite[GlutenDataFrameFunctionsSuite] // blocked by Velox-5768 .exclude("aggregate function - array for primitive type containing null") @@ -337,7 +331,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("Shouldn't change broadcast join buildSide if user clearly specified") .exclude("Shouldn't bias towards build right if user didn't specify") .exclude("SPARK-23192: broadcast hint should be retained after using the cached data") - .exclude("broadcast hint isn't propagated after a join") .exclude("broadcast join where streamed side's output partitioning is HashPartitioning") .exclude("broadcast join where streamed side's output partitioning is PartitioningCollection") enableSuite[GlutenSQLQuerySuite] @@ -417,6 +410,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("save csv with empty fields with user defined empty values") .exclude("save csv with quote") .exclude("SPARK-13543 Write the output as uncompressed via option()") + .exclude("DDL test with tab separated file") + .exclude("DDL test parsing decimal type") + .exclude("test with tab delimiter and double quote") // Arrow not support corrupt record .exclude("SPARK-27873: disabling enforceSchema should not fail columnNameOfCorruptRecord") enableSuite[GlutenCSVv2Suite] @@ -429,6 +425,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("save csv with empty fields with user defined empty values") .exclude("save csv with quote") .exclude("SPARK-13543 Write the output as uncompressed via option()") + .exclude("DDL test with tab separated file") + .exclude("DDL test parsing decimal type") + .exclude("test with tab delimiter and double quote") // Rule org.apache.spark.sql.execution.datasources.v2.V2ScanRelationPushDown in batch // Early Filter and Projection Push-Down generated an invalid plan .exclude("SPARK-26208: write and read empty data to csv file with headers") @@ -441,6 +440,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("save csv with empty fields with user defined empty values") .exclude("save csv with quote") .exclude("SPARK-13543 Write the output as uncompressed via option()") + .exclude("DDL test with tab separated file") + .exclude("DDL test parsing decimal type") + .exclude("test with tab delimiter and double quote") enableSuite[GlutenJsonV1Suite] // FIXME: Array direct selection fails .exclude("Complex field and type inferring") diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/GlutenSortShuffleSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/GlutenSortShuffleSuite.scala new file mode 100644 index 0000000000000..70579c886248e --- /dev/null +++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/GlutenSortShuffleSuite.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark + +import org.apache.spark.sql.GlutenTestsBaseTrait + +class GlutenSortShuffleSuite extends SortShuffleSuite with GlutenTestsBaseTrait { + override def beforeAll(): Unit = { + super.beforeAll() + conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager") + } +} diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala index edd2a5a9672df..a4b530e637af9 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala @@ -16,4 +16,40 @@ */ package org.apache.spark.sql -class GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTestsTrait {} +import org.apache.spark.sql.execution.ProjectExec +import org.apache.spark.sql.functions.{expr, input_file_name} +import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType} + +class GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTestsTrait { + testGluten("input_file_name with scan is fallback") { + withTempPath { + dir => + val rawData = Seq( + Row(1, "Alice", Seq(Row(Seq(1, 2, 3)))), + Row(2, "Bob", Seq(Row(Seq(4, 5)))), + Row(3, "Charlie", Seq(Row(Seq(6, 7, 8, 9)))) + ) + val schema = StructType( + Array( + StructField("id", IntegerType, nullable = false), + StructField("name", StringType, nullable = false), + StructField( + "nested_column", + ArrayType( + StructType(Array( + StructField("array_in_struct", ArrayType(IntegerType), nullable = true) + ))), + nullable = true) + )) + val data: DataFrame = spark.createDataFrame(sparkContext.parallelize(rawData), schema) + data.write.parquet(dir.getCanonicalPath) + + val q = + spark.read.parquet(dir.getCanonicalPath).select(input_file_name(), expr("nested_column")) + val firstRow = q.head() + assert(firstRow.getString(0).contains(dir.toURI.getPath)) + val project = q.queryExecution.executedPlan.collect { case p: ProjectExec => p } + assert(project.size == 1) + } + } +} diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenDataFrameFunctionsSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenDataFrameFunctionsSuite.scala index 2b0b40790a76d..e64f760ab55fa 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenDataFrameFunctionsSuite.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenDataFrameFunctionsSuite.scala @@ -49,4 +49,86 @@ class GlutenDataFrameFunctionsSuite extends DataFrameFunctionsSuite with GlutenS false ) } + + testGluten("flatten function") { + // Test cases with a primitive type + val intDF = Seq( + (Seq(Seq(1, 2, 3), Seq(4, 5), Seq(6))), + (Seq(Seq(1, 2))), + (Seq(Seq(1), Seq.empty)), + (Seq(Seq.empty, Seq(1))) + ).toDF("i") + + val intDFResult = Seq(Row(Seq(1, 2, 3, 4, 5, 6)), Row(Seq(1, 2)), Row(Seq(1)), Row(Seq(1))) + + def testInt(): Unit = { + checkAnswer(intDF.select(flatten($"i")), intDFResult) + 
checkAnswer(intDF.selectExpr("flatten(i)"), intDFResult) + } + + // Test with local relation, the Project will be evaluated without codegen + testInt() + // Test with cached relation, the Project will be evaluated with codegen + intDF.cache() + testInt() + + // Test cases with non-primitive types + val strDF = Seq( + (Seq(Seq("a", "b"), Seq("c"), Seq("d", "e", "f"))), + (Seq(Seq("a", "b"))), + (Seq(Seq("a", null), Seq(null, "b"), Seq(null, null))), + (Seq(Seq("a"), Seq.empty)), + (Seq(Seq.empty, Seq("a"))) + ).toDF("s") + + val strDFResult = Seq( + Row(Seq("a", "b", "c", "d", "e", "f")), + Row(Seq("a", "b")), + Row(Seq("a", null, null, "b", null, null)), + Row(Seq("a")), + Row(Seq("a"))) + + def testString(): Unit = { + checkAnswer(strDF.select(flatten($"s")), strDFResult) + checkAnswer(strDF.selectExpr("flatten(s)"), strDFResult) + } + + // Test with local relation, the Project will be evaluated without codegen + testString() + // Test with cached relation, the Project will be evaluated with codegen + strDF.cache() + testString() + + val arrDF = Seq((1, "a", Seq(1, 2, 3))).toDF("i", "s", "arr") + + def testArray(): Unit = { + checkAnswer( + arrDF.selectExpr("flatten(array(arr, array(null, 5), array(6, null)))"), + Seq(Row(Seq(1, 2, 3, null, 5, 6, null)))) + checkAnswer( + arrDF.selectExpr("flatten(array(array(arr, arr), array(arr)))"), + Seq(Row(Seq(Seq(1, 2, 3), Seq(1, 2, 3), Seq(1, 2, 3))))) + } + + // Test with local relation, the Project will be evaluated without codegen + testArray() + // Test with cached relation, the Project will be evaluated with codegen + arrDF.cache() + testArray() + + // Error test cases + val oneRowDF = Seq((1, "a", Seq(1, 2, 3))).toDF("i", "s", "arr") + intercept[AnalysisException] { + oneRowDF.select(flatten($"arr")) + } + intercept[AnalysisException] { + oneRowDF.select(flatten($"i")) + } + intercept[AnalysisException] { + oneRowDF.select(flatten($"s")) + } + intercept[AnalysisException] { + oneRowDF.selectExpr("flatten(null)") + } + } } diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala index 154e67ae3ae0e..8d1f7320dd422 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala @@ -248,80 +248,4 @@ class GlutenDateFunctionsSuite extends DateFunctionsSuite with GlutenSQLTestsTra } } } - - testGluten("to_utc_timestamp with literal zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") - ).toDF("a", "b") - checkAnswer( - df.select(to_utc_timestamp(col("a"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-25 07:00:00"))) - ) - checkAnswer( - df.select(to_utc_timestamp(col("b"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-25 07:00:00"))) - ) - } - - testGluten("to_utc_timestamp with column zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "America/Los_Angeles"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "Europe/Paris") - ).toDF("a", "b", "c") - checkAnswer( - df.select(to_utc_timestamp(col("a"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - 
Row(Timestamp.valueOf("2015-07-24 22:00:00"))) - ) - checkAnswer( - df.select(to_utc_timestamp(col("b"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-24 22:00:00"))) - ) - } - - testGluten("from_utc_timestamp with literal zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") - ).toDF("a", "b") - checkAnswer( - df.select(from_utc_timestamp(col("a"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-23 17:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - checkAnswer( - df.select(from_utc_timestamp(col("b"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-23 17:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - } - - testGluten("from_utc_timestamp with column zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "Europe/Paris"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "America/Los_Angeles") - ).toDF("a", "b", "c") - checkAnswer( - df.select(from_utc_timestamp(col("a"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 02:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - checkAnswer( - df.select(from_utc_timestamp(col("b"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 02:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - } } diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala index a686b6456e9fd..b88fdc59db6a4 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala @@ -21,9 +21,4 @@ import org.apache.spark.sql.catalyst.expressions.ExpressionEvalHelper class GlutenStringFunctionsSuite extends StringFunctionsSuite with GlutenSQLTestsTrait - with ExpressionEvalHelper { - - override def testNameBlackList: Seq[String] = super.testNameBlackList ++ Seq( - "string / binary length function" - ) -} + with ExpressionEvalHelper {} diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala index 29b9141b72004..44d4502aedacb 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala @@ -382,6 +382,9 @@ class GlutenDateExpressionsSuite extends DateExpressionsSuite with GlutenTestsTr checkEvaluation( FromUnixTime(Literal(-1000L), Literal(fmt2), timeZoneId), sdf2.format(new Timestamp(-1000000))) + checkEvaluation( + FromUnixTime(Literal(Long.MaxValue), Literal(fmt2), timeZoneId), + sdf2.format(new Timestamp(-1000))) checkEvaluation( FromUnixTime( Literal.create(null, LongType), diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/execution/joins/GlutenBroadcastJoinSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/execution/joins/GlutenBroadcastJoinSuite.scala index c9ccc1afc75d6..f418ec06645c7 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/execution/joins/GlutenBroadcastJoinSuite.scala +++ 
b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/execution/joins/GlutenBroadcastJoinSuite.scala @@ -22,7 +22,6 @@ import org.apache.gluten.utils.{BackendTestUtils, SystemParameters} import org.apache.spark.sql.{GlutenTestsCommonTrait, SparkSession} import org.apache.spark.sql.catalyst.optimizer._ -import org.apache.spark.sql.execution.exchange.EnsureRequirements import org.apache.spark.sql.functions.broadcast import org.apache.spark.sql.internal.SQLConf @@ -41,8 +40,6 @@ class GlutenBroadcastJoinSuite extends BroadcastJoinSuite with GlutenTestsCommon * Create a new [[SparkSession]] running in local-cluster mode with unsafe and codegen enabled. */ - private val EnsureRequirements = new EnsureRequirements() - private val isVeloxBackend = BackendTestUtils.isVeloxBackendLoaded() // BroadcastHashJoinExecTransformer is not case class, can't call toString method, @@ -235,22 +232,6 @@ class GlutenBroadcastJoinSuite extends BroadcastJoinSuite with GlutenTestsCommon } } - testGluten("broadcast hint isn't propagated after a join") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { - val df1 = Seq((1, "4"), (2, "2")).toDF("key", "value") - val df2 = Seq((1, "1"), (2, "2")).toDF("key", "value") - val df3 = df1.join(broadcast(df2), Seq("key"), "inner").drop(df2("key")) - - val df4 = Seq((1, "5"), (2, "5")).toDF("key", "value") - val df5 = df4.join(df3, Seq("key"), "inner") - - val plan = EnsureRequirements.apply(df5.queryExecution.sparkPlan) - - assert(plan.collect { case p: BroadcastHashJoinExec => p }.size === 1) - assert(plan.collect { case p: ShuffledHashJoinExec => p }.size === 1) - } - } - private def assertJoinBuildSide(sqlStr: String, joinMethod: String, buildSide: BuildSide): Any = { val executedPlan = stripAQEPlan(sql(sqlStr).queryExecution.executedPlan) executedPlan match { diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala index 00b4bf5821f1e..4df9c63b3ef6d 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.sql.extension -import org.apache.gluten.extension.{ColumnarOverrideRules, JoinSelectionOverrides} +import org.apache.gluten.extension.ColumnarOverrideRules import org.apache.spark.SparkConf import org.apache.spark.sql._ @@ -31,7 +31,6 @@ class GlutenSessionExtensionSuite extends GlutenSQLTestsTrait { testGluten("test gluten extensions") { assert(spark.sessionState.columnarRules.contains(ColumnarOverrideRules(spark))) - assert(spark.sessionState.planner.strategies.contains(JoinSelectionOverrides(spark))) assert(spark.sessionState.planner.strategies.contains(MySparkStrategy(spark))) assert(spark.sessionState.analyzer.extendedResolutionRules.contains(MyRule(spark))) diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala index a7ad96600d8f3..88d771ec8a4f2 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala @@ -52,6 +52,7 @@ case class TestFileSourceScanExecTransformer( 
override def getPartitions: Seq[InputPartition] = BackendsApiManager.getTransformerApiInstance.genInputPartitionSeq( relation, + requiredSchema, selectedPartitions, output, bucketedScan, diff --git a/gluten-ut/spark33/pom.xml b/gluten-ut/spark33/pom.xml index 0f0a0703c3c50..5f9a28e2459d3 100644 --- a/gluten-ut/spark33/pom.xml +++ b/gluten-ut/spark33/pom.xml @@ -83,7 +83,7 @@ org.apache.arrow arrow-c-data - ${arrow.version} + ${arrow-gluten.version} test diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 85f3f94cca95e..df9f49bfc72ed 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -74,6 +74,7 @@ class ClickHouseTestSettings extends BackendTestSettings { false // nativeDoValidate failed due to spark conf cleanup case "GlutenBloomFilterAggregateQuerySuite" => !bloomFilterCases.contains(testName) + case "GlutenSortShuffleSuite" => false case _ => true } preCheck && super.shouldRun(suiteName, testName) @@ -189,6 +190,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("shuffle function - array for primitive type not containing null") .exclude("shuffle function - array for primitive type containing null") .exclude("shuffle function - array for non-primitive type") + .exclude("flatten function") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite].exclude( @@ -325,9 +327,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-30793: truncate timestamps before the epoch to seconds and minutes") .excludeGlutenTest("unix_timestamp") .excludeGlutenTest("to_unix_timestamp") - .exclude("to_utc_timestamp with literal zone") .exclude("to_utc_timestamp with column zone") - .exclude("from_utc_timestamp with literal zone") .exclude("from_utc_timestamp with column zone") enableSuite[GlutenDeprecatedAPISuite] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOff].excludeGlutenTest( @@ -458,8 +458,10 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("string regex_replace / regex_extract") .exclude("string overlay function") .exclude("binary overlay function") - .exclude("string / binary substring function") .exclude("string parse_url function") + .exclude("string / binary length function") + .exclude("SPARK-36751: add octet length api for scala") + .exclude("SPARK-36751: add bit length api for scala") enableSuite[GlutenSubquerySuite] .exclude("SPARK-15370: COUNT bug in subquery in subquery in subquery") .exclude("SPARK-26893: Allow pushdown of partition pruning subquery filters to file source") @@ -474,58 +476,9 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenXPathFunctionsSuite] enableSuite[QueryTestSuite] enableSuite[GlutenAnsiCastSuiteWithAnsiModeOff] - .exclude("null cast") .exclude("cast string to date") - .exclude("cast string to timestamp") - .exclude("cast from boolean") - .exclude("cast from int") - .exclude("cast from long") - .exclude("cast from float") - .exclude("cast from double") - .exclude("cast from timestamp") - .exclude("data type casting") - .exclude("cast and add") - .exclude("from decimal") - .exclude("cast from array") - .exclude("cast from map") - .exclude("cast from struct") - .exclude("cast struct 
with a timestamp field") - .exclude("cast between string and interval") - .exclude("cast string to boolean") - .exclude("SPARK-20302 cast with same structure") - .exclude("SPARK-22500: cast for struct should not generate codes beyond 64KB") - .exclude("SPARK-27671: cast from nested null type in struct") - .exclude("Process Infinity, -Infinity, NaN in case insensitive manner") - .exclude("SPARK-22825 Cast array to string") - .exclude("SPARK-33291: Cast array with null elements to string") - .exclude("SPARK-22973 Cast map to string") - .exclude("SPARK-22981 Cast struct to string") - .exclude("SPARK-33291: Cast struct with null elements to string") - .exclude("SPARK-34667: cast year-month interval to string") - .exclude("SPARK-34668: cast day-time interval to string") - .exclude("SPARK-35698: cast timestamp without time zone to string") .exclude("SPARK-35711: cast timestamp without time zone to timestamp with local time zone") - .exclude("SPARK-35716: cast timestamp without time zone to date type") - .exclude("SPARK-35718: cast date type to timestamp without timezone") - .exclude("SPARK-35719: cast timestamp with local time zone to timestamp without timezone") - .exclude("SPARK-35720: cast string to timestamp without timezone") - .exclude("SPARK-35112: Cast string to day-time interval") - .exclude("SPARK-35111: Cast string to year-month interval") - .exclude("SPARK-35820: Support cast DayTimeIntervalType in different fields") .exclude("SPARK-35819: Support cast YearMonthIntervalType in different fields") - .exclude("SPARK-35768: Take into account year-month interval fields in cast") - .exclude("SPARK-35735: Take into account day-time interval fields in cast") - .exclude("ANSI mode: Throw exception on casting out-of-range value to byte type") - .exclude("ANSI mode: Throw exception on casting out-of-range value to short type") - .exclude("ANSI mode: Throw exception on casting out-of-range value to int type") - .exclude("ANSI mode: Throw exception on casting out-of-range value to long type") - .exclude("Fast fail for cast string type to decimal type in ansi mode") - .exclude("cast a timestamp before the epoch 1970-01-01 00:00:00Z") - .exclude("cast from array III") - .exclude("cast from map II") - .exclude("cast from map III") - .exclude("cast from struct II") - .exclude("cast from struct III") enableSuite[GlutenAnsiCastSuiteWithAnsiModeOn] .exclude("null cast") .exclude("cast string to date") @@ -596,7 +549,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-36920: Support day-time intervals by ABS") .exclude("SPARK-36921: Support YearMonthIntervalType by div") .exclude("SPARK-36921: Support DayTimeIntervalType by div") - enableSuite[GlutenBitwiseExpressionsSuite].exclude("BitGet") + enableSuite[GlutenBitwiseExpressionsSuite] enableSuite[GlutenCastSuite] .exclude("null cast") .exclude("cast string to date") @@ -715,7 +668,6 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("Sequence with default step") .exclude("Reverse") .exclude("elementAt") - .exclude("Flatten") .exclude("ArrayRepeat") .exclude("Array remove") .exclude("Array Distinct") @@ -850,7 +802,6 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-37967: Literal.create support ObjectType") enableSuite[GlutenMathExpressionsSuite] .exclude("tanh") - .exclude("rint") .exclude("unhex") .exclude("atan2") .exclude("round/bround/floor/ceil") @@ -904,34 +855,20 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK - 34814: LikeSimplification 
should handleNULL") enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix") enableSuite[GlutenStringExpressionsSuite] - .exclude("concat") .exclude("StringComparison") .exclude("Substring") .exclude("string substring_index function") .exclude("SPARK-40213: ascii for Latin-1 Supplement characters") .exclude("ascii for string") - .exclude("string for ascii") .exclude("base64/unbase64 for string") .exclude("encode/decode for string") - .exclude("Levenshtein distance") - .exclude("soundex unit test") - .exclude("replace") .exclude("overlay for string") .exclude("overlay for byte array") .exclude("translate") - .exclude("FORMAT") - .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB") - .exclude("INSTR") .exclude("LOCATE") - .exclude("LPAD/RPAD") .exclude("REPEAT") - .exclude("length for string / binary") - .exclude("format_number / FormatNumber") - .exclude("ToNumber: positive tests") - .exclude("ToNumber: negative tests (the input string does not match the format string)") .exclude("ParseUrl") .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url") - .exclude("Sentences") enableSuite[GlutenTryCastSuite] .exclude("null cast") .exclude("cast string to date") diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 3b32cebca13fb..ae3e7c7b8e9d7 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -18,6 +18,7 @@ package org.apache.gluten.utils.velox import org.apache.gluten.utils.{BackendTestSettings, SQLQueryTestSettings} +import org.apache.spark.GlutenSortShuffleSuite import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions.{GlutenAnsiCastSuiteWithAnsiModeOff, GlutenAnsiCastSuiteWithAnsiModeOn, GlutenArithmeticExpressionSuite, GlutenBitwiseExpressionsSuite, GlutenCastSuite, GlutenCastSuiteWithAnsiModeOn, GlutenCollectionExpressionsSuite, GlutenComplexTypeSuite, GlutenConditionalExpressionSuite, GlutenDateExpressionsSuite, GlutenDecimalExpressionSuite, GlutenDecimalPrecisionSuite, GlutenHashExpressionsSuite, GlutenHigherOrderFunctionsSuite, GlutenIntervalExpressionsSuite, GlutenLiteralExpressionSuite, GlutenMathExpressionsSuite, GlutenMiscExpressionsSuite, GlutenNondeterministicSuite, GlutenNullExpressionsSuite, GlutenPredicateSuite, GlutenRandomSuite, GlutenRegexpExpressionsSuite, GlutenSortOrderExpressionsSuite, GlutenStringExpressionsSuite, GlutenTryCastSuite} import org.apache.spark.sql.connector._ @@ -159,6 +160,7 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("random") .exclude("SPARK-9127 codegen with long seed") enableSuite[GlutenRegexpExpressionsSuite] + enableSuite[GlutenSortShuffleSuite] enableSuite[GlutenSortOrderExpressionsSuite] enableSuite[GlutenStringExpressionsSuite] enableSuite[VeloxAdaptiveQueryExecSuite] @@ -204,6 +206,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("save csv with empty fields with user defined empty values") .exclude("save csv with quote") .exclude("SPARK-13543 Write the output as uncompressed via option()") + .exclude("DDL test with tab separated file") + .exclude("DDL test parsing decimal type") + .exclude("test with tab delimiter and double quote") // Arrow not support corrupt record .exclude("SPARK-27873: disabling enforceSchema should not fail 
columnNameOfCorruptRecord") enableSuite[GlutenCSVv2Suite] @@ -216,6 +221,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("save csv with empty fields with user defined empty values") .exclude("save csv with quote") .exclude("SPARK-13543 Write the output as uncompressed via option()") + .exclude("DDL test with tab separated file") + .exclude("DDL test parsing decimal type") + .exclude("test with tab delimiter and double quote") // Rule org.apache.spark.sql.execution.datasources.v2.V2ScanRelationPushDown in batch // Early Filter and Projection Push-Down generated an invalid plan .exclude("SPARK-26208: write and read empty data to csv file with headers") @@ -227,6 +235,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("save csv with empty fields with user defined empty values") .exclude("save csv with quote") .exclude("SPARK-13543 Write the output as uncompressed via option()") + .exclude("DDL test with tab separated file") + .exclude("DDL test parsing decimal type") + .exclude("test with tab delimiter and double quote") // Arrow not support corrupt record .exclude("SPARK-27873: disabling enforceSchema should not fail columnNameOfCorruptRecord") enableSuite[GlutenJsonV1Suite] @@ -854,7 +865,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("Shouldn't change broadcast join buildSide if user clearly specified") .exclude("Shouldn't bias towards build right if user didn't specify") .exclude("SPARK-23192: broadcast hint should be retained after using the cached data") - .exclude("broadcast hint isn't propagated after a join") .exclude("broadcast join where streamed side's output partitioning is HashPartitioning") enableSuite[GlutenExistenceJoinSuite] @@ -1059,14 +1069,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("to_timestamp") // Legacy mode is not supported, assuming this mode is not commonly used. .exclude("SPARK-30668: use legacy timestamp parser in to_timestamp") - // Replaced by another test. - .exclude("to_utc_timestamp with literal zone") - // Replaced by another test. - .exclude("to_utc_timestamp with column zone") - // Replaced by another test - .exclude("from_utc_timestamp with literal zone") - // Replaced by another test - .exclude("from_utc_timestamp with column zone") enableSuite[GlutenDeprecatedAPISuite] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOff] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOn] @@ -1101,8 +1103,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-31116: Select nested schema with case insensitive mode") // exclude as original metric not correct when task offloaded to velox .exclude("SPARK-37585: test input metrics for DSV2 with output limits") - // Unknown. Need to investigate. - .exclude("SPARK-30362: test input metrics for DSV2") // DISABLED: GLUTEN-4893 Vanilla UT checks scan operator by exactly matching the class type .exclude("File source v2: support passing data filters to FileScan without partitionFilters") // DISABLED: GLUTEN-4893 Vanilla UT checks scan operator by exactly matching the class type diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/GlutenSortShuffleSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/GlutenSortShuffleSuite.scala new file mode 100644 index 0000000000000..70579c886248e --- /dev/null +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/GlutenSortShuffleSuite.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark + +import org.apache.spark.sql.GlutenTestsBaseTrait + +class GlutenSortShuffleSuite extends SortShuffleSuite with GlutenTestsBaseTrait { + override def beforeAll(): Unit = { + super.beforeAll() + conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager") + } +} diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala index edd2a5a9672df..a4b530e637af9 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala @@ -16,4 +16,40 @@ */ package org.apache.spark.sql -class GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTestsTrait {} +import org.apache.spark.sql.execution.ProjectExec +import org.apache.spark.sql.functions.{expr, input_file_name} +import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType} + +class GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTestsTrait { + testGluten("input_file_name with scan is fallback") { + withTempPath { + dir => + val rawData = Seq( + Row(1, "Alice", Seq(Row(Seq(1, 2, 3)))), + Row(2, "Bob", Seq(Row(Seq(4, 5)))), + Row(3, "Charlie", Seq(Row(Seq(6, 7, 8, 9)))) + ) + val schema = StructType( + Array( + StructField("id", IntegerType, nullable = false), + StructField("name", StringType, nullable = false), + StructField( + "nested_column", + ArrayType( + StructType(Array( + StructField("array_in_struct", ArrayType(IntegerType), nullable = true) + ))), + nullable = true) + )) + val data: DataFrame = spark.createDataFrame(sparkContext.parallelize(rawData), schema) + data.write.parquet(dir.getCanonicalPath) + + val q = + spark.read.parquet(dir.getCanonicalPath).select(input_file_name(), expr("nested_column")) + val firstRow = q.head() + assert(firstRow.getString(0).contains(dir.toURI.getPath)) + val project = q.queryExecution.executedPlan.collect { case p: ProjectExec => p } + assert(project.size == 1) + } + } +} diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenDataFrameFunctionsSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenDataFrameFunctionsSuite.scala index 2b0b40790a76d..e64f760ab55fa 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenDataFrameFunctionsSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenDataFrameFunctionsSuite.scala @@ -49,4 +49,86 @@ class GlutenDataFrameFunctionsSuite extends DataFrameFunctionsSuite with GlutenS false ) } + + testGluten("flatten function") { + // Test cases with a primitive type + val intDF = Seq( + (Seq(Seq(1, 2, 3), Seq(4, 5), 
Seq(6))), + (Seq(Seq(1, 2))), + (Seq(Seq(1), Seq.empty)), + (Seq(Seq.empty, Seq(1))) + ).toDF("i") + + val intDFResult = Seq(Row(Seq(1, 2, 3, 4, 5, 6)), Row(Seq(1, 2)), Row(Seq(1)), Row(Seq(1))) + + def testInt(): Unit = { + checkAnswer(intDF.select(flatten($"i")), intDFResult) + checkAnswer(intDF.selectExpr("flatten(i)"), intDFResult) + } + + // Test with local relation, the Project will be evaluated without codegen + testInt() + // Test with cached relation, the Project will be evaluated with codegen + intDF.cache() + testInt() + + // Test cases with non-primitive types + val strDF = Seq( + (Seq(Seq("a", "b"), Seq("c"), Seq("d", "e", "f"))), + (Seq(Seq("a", "b"))), + (Seq(Seq("a", null), Seq(null, "b"), Seq(null, null))), + (Seq(Seq("a"), Seq.empty)), + (Seq(Seq.empty, Seq("a"))) + ).toDF("s") + + val strDFResult = Seq( + Row(Seq("a", "b", "c", "d", "e", "f")), + Row(Seq("a", "b")), + Row(Seq("a", null, null, "b", null, null)), + Row(Seq("a")), + Row(Seq("a"))) + + def testString(): Unit = { + checkAnswer(strDF.select(flatten($"s")), strDFResult) + checkAnswer(strDF.selectExpr("flatten(s)"), strDFResult) + } + + // Test with local relation, the Project will be evaluated without codegen + testString() + // Test with cached relation, the Project will be evaluated with codegen + strDF.cache() + testString() + + val arrDF = Seq((1, "a", Seq(1, 2, 3))).toDF("i", "s", "arr") + + def testArray(): Unit = { + checkAnswer( + arrDF.selectExpr("flatten(array(arr, array(null, 5), array(6, null)))"), + Seq(Row(Seq(1, 2, 3, null, 5, 6, null)))) + checkAnswer( + arrDF.selectExpr("flatten(array(array(arr, arr), array(arr)))"), + Seq(Row(Seq(Seq(1, 2, 3), Seq(1, 2, 3), Seq(1, 2, 3))))) + } + + // Test with local relation, the Project will be evaluated without codegen + testArray() + // Test with cached relation, the Project will be evaluated with codegen + arrDF.cache() + testArray() + + // Error test cases + val oneRowDF = Seq((1, "a", Seq(1, 2, 3))).toDF("i", "s", "arr") + intercept[AnalysisException] { + oneRowDF.select(flatten($"arr")) + } + intercept[AnalysisException] { + oneRowDF.select(flatten($"i")) + } + intercept[AnalysisException] { + oneRowDF.select(flatten($"s")) + } + intercept[AnalysisException] { + oneRowDF.selectExpr("flatten(null)") + } + } } diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala index fbbfdf45daa86..a946e6de43452 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala @@ -246,80 +246,4 @@ class GlutenDateFunctionsSuite extends DateFunctionsSuite with GlutenSQLTestsTra } } } - - testGluten("to_utc_timestamp with literal zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") - ).toDF("a", "b") - checkAnswer( - df.select(to_utc_timestamp(col("a"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-25 07:00:00"))) - ) - checkAnswer( - df.select(to_utc_timestamp(col("b"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-25 07:00:00"))) - ) - } - - testGluten("to_utc_timestamp with column zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", 
"America/Los_Angeles"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "Europe/Paris") - ).toDF("a", "b", "c") - checkAnswer( - df.select(to_utc_timestamp(col("a"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-24 22:00:00"))) - ) - checkAnswer( - df.select(to_utc_timestamp(col("b"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-24 22:00:00"))) - ) - } - - testGluten("from_utc_timestamp with literal zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") - ).toDF("a", "b") - checkAnswer( - df.select(from_utc_timestamp(col("a"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-23 17:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - checkAnswer( - df.select(from_utc_timestamp(col("b"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-23 17:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - } - - testGluten("from_utc_timestamp with column zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "Europe/Paris"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "America/Los_Angeles") - ).toDF("a", "b", "c") - checkAnswer( - df.select(from_utc_timestamp(col("a"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 02:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - checkAnswer( - df.select(from_utc_timestamp(col("b"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 02:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - } } diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenImplicitsTest.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenImplicitsTest.scala index e4356cec8ff11..9e120945be348 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenImplicitsTest.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenImplicitsTest.scala @@ -79,10 +79,10 @@ class GlutenImplicitsTest extends GlutenSQLTestsBaseTrait { testGluten("fallbackSummary with shuffle") { withAQEEnabledAndDisabled { val df = spark.sql("SELECT c2 FROM t1 group by c2").filter(_.getLong(0) > 0) - assert(df.fallbackSummary().numGlutenNodes == 5, df.fallbackSummary()) + assert(df.fallbackSummary().numGlutenNodes == 6, df.fallbackSummary()) assert(df.fallbackSummary().numFallbackNodes == 1, df.fallbackSummary()) df.collect() - assert(df.fallbackSummary().numGlutenNodes == 5, df.fallbackSummary()) + assert(df.fallbackSummary().numGlutenNodes == 6, df.fallbackSummary()) assert(df.fallbackSummary().numFallbackNodes == 1, df.fallbackSummary()) } } @@ -119,10 +119,10 @@ class GlutenImplicitsTest extends GlutenSQLTestsBaseTrait { testGluten("fallbackSummary with cached data and shuffle") { withAQEEnabledAndDisabled { val df = spark.sql("select * from t1").filter(_.getLong(0) > 0).cache.repartition() - assert(df.fallbackSummary().numGlutenNodes == 6, df.fallbackSummary()) + assert(df.fallbackSummary().numGlutenNodes == 7, df.fallbackSummary()) assert(df.fallbackSummary().numFallbackNodes == 1, df.fallbackSummary()) df.collect() - assert(df.fallbackSummary().numGlutenNodes == 6, df.fallbackSummary()) + assert(df.fallbackSummary().numGlutenNodes == 7, df.fallbackSummary()) assert(df.fallbackSummary().numFallbackNodes == 1, df.fallbackSummary()) } } diff --git 
a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala index c58284e4403bb..3d82e214f0311 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala @@ -30,10 +30,6 @@ class GlutenStringFunctionsSuite import testImplicits._ - override def testNameBlackList: Seq[String] = super.testNameBlackList ++ Seq( - "string / binary length function" - ) - testGluten("string split function with no limit and regex pattern") { val df1 = Seq(("aaAbbAcc4")).toDF("a").select(split($"a", "A")) checkAnswer(df1, Row(Seq("aa", "bb", "cc4"))) diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala index 701dd1e4a8265..234537feef8a3 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala @@ -380,6 +380,9 @@ class GlutenDateExpressionsSuite extends DateExpressionsSuite with GlutenTestsTr checkEvaluation( FromUnixTime(Literal(-1000L), Literal(fmt2), timeZoneId), sdf2.format(new Timestamp(-1000000))) + checkEvaluation( + FromUnixTime(Literal(Long.MaxValue), Literal(fmt2), timeZoneId), + sdf2.format(new Timestamp(-1000))) checkEvaluation( FromUnixTime( Literal.create(null, LongType), diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala index fff883d49e86f..b9c9d8a270bf2 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.execution.BasicScanExecTransformer import org.apache.gluten.extension.GlutenPlan -import org.apache.gluten.extension.columnar.{FallbackEmptySchemaRelation, TRANSFORM_UNSUPPORTED, TransformHints} +import org.apache.gluten.extension.columnar.{FallbackEmptySchemaRelation, FallbackTags, TRANSFORM_UNSUPPORTED} import org.apache.gluten.extension.columnar.heuristic.HeuristicApplier import org.apache.gluten.extension.columnar.transition.InsertTransitions import org.apache.gluten.utils.QueryPlanSelector @@ -124,10 +124,10 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { testGluten("Tag not transformable more than once") { val originalPlan = UnaryOp1(LeafOp(supportsColumnar = true)) - TransformHints.tag(originalPlan, TRANSFORM_UNSUPPORTED(Some("fake reason"))) + FallbackTags.tag(originalPlan, TRANSFORM_UNSUPPORTED(Some("fake reason"))) val rule = FallbackEmptySchemaRelation() val newPlan = rule.apply(originalPlan) - val reason = TransformHints.getHint(newPlan).asInstanceOf[TRANSFORM_UNSUPPORTED].reason + val reason = FallbackTags.getTag(newPlan).asInstanceOf[TRANSFORM_UNSUPPORTED].reason assert(reason.isDefined) if (BackendsApiManager.getSettings.fallbackOnEmptySchema(newPlan)) { assert( diff --git 
a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/GlutenReplaceHashWithSortAggSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/GlutenReplaceHashWithSortAggSuite.scala index 026e2dde00558..92e6fee97ea93 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/GlutenReplaceHashWithSortAggSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/GlutenReplaceHashWithSortAggSuite.scala @@ -85,11 +85,11 @@ class GlutenReplaceHashWithSortAggSuite withTempView("t1", "t2") { spark.range(100).selectExpr("id as key").createOrReplaceTempView("t1") spark.range(50).selectExpr("id as key").createOrReplaceTempView("t2") - Seq("COUNT", "COLLECT_LIST").foreach { - aggExpr => + Seq(("COUNT", 0, 1, 2, 0), ("COLLECT_LIST", 2, 0, 2, 0)).foreach { + aggExprInfo => val query = s""" - |SELECT key, $aggExpr(key) + |SELECT key, ${aggExprInfo._1}(key) |FROM |( | SELECT /*+ SHUFFLE_MERGE(t1) */ t1.key AS key @@ -102,7 +102,7 @@ class GlutenReplaceHashWithSortAggSuite if (BackendsApiManager.getSettings.mergeTwoPhasesHashBaseAggregateIfNeed()) { checkAggs(query, 1, 0, 1, 0) } else { - checkAggs(query, 2, 0, 2, 0) + checkAggs(query, aggExprInfo._2, aggExprInfo._3, aggExprInfo._4, aggExprInfo._5) } } } diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala index 5e4d1ed8f81f4..ed347d024c1c2 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala @@ -16,12 +16,11 @@ */ package org.apache.spark.sql.execution.datasources +import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.execution.{FileSourceScanExecTransformer, FilterExecTransformer} -import org.apache.gluten.utils.BackendTestUtils import org.apache.spark.sql.{Column, DataFrame, Row} import org.apache.spark.sql.GlutenSQLTestsBaseTrait -import org.apache.spark.sql.GlutenTestConstants.GLUTEN_TEST import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType} @@ -60,7 +59,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS f: (DataFrame, Map[String, Any], Map[String, Any]) => Unit): Unit = { Seq("parquet").foreach { testFileFormat => - test(s"$GLUTEN_TEST metadata struct ($testFileFormat): " + testName) { + testGluten(s"metadata struct ($testFileFormat): " + testName) { withTempDir { dir => import scala.collection.JavaConverters._ @@ -109,7 +108,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS METADATA_FILE_MODIFICATION_TIME, "age") dfWithMetadata.collect - if (BackendTestUtils.isVeloxBackendLoaded()) { + if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) { checkOperatorMatch[FileSourceScanExecTransformer](dfWithMetadata) } else { checkOperatorMatch[FileSourceScanExec](dfWithMetadata) @@ -134,7 +133,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS .where(Column(METADATA_FILE_NAME) === f0((METADATA_FILE_NAME))) val ret = filterDF.collect assert(ret.size == 1) - if (BackendTestUtils.isVeloxBackendLoaded()) { + if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) { 
checkOperatorMatch[FileSourceScanExecTransformer](filterDF) } else { checkOperatorMatch[FileSourceScanExec](filterDF) @@ -150,7 +149,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS Row(f1(METADATA_FILE_PATH)) ) ) - if (BackendTestUtils.isVeloxBackendLoaded()) { + if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) { checkOperatorMatch[FileSourceScanExecTransformer](filterDF) } else { checkOperatorMatch[FileSourceScanExec](filterDF) diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala index 00b4bf5821f1e..4df9c63b3ef6d 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.sql.extension -import org.apache.gluten.extension.{ColumnarOverrideRules, JoinSelectionOverrides} +import org.apache.gluten.extension.ColumnarOverrideRules import org.apache.spark.SparkConf import org.apache.spark.sql._ @@ -31,7 +31,6 @@ class GlutenSessionExtensionSuite extends GlutenSQLTestsTrait { testGluten("test gluten extensions") { assert(spark.sessionState.columnarRules.contains(ColumnarOverrideRules(spark))) - assert(spark.sessionState.planner.strategies.contains(JoinSelectionOverrides(spark))) assert(spark.sessionState.planner.strategies.contains(MySparkStrategy(spark))) assert(spark.sessionState.analyzer.extendedResolutionRules.contains(MyRule(spark))) diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala index 592019f34ccf0..85af9a6238531 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala @@ -50,6 +50,7 @@ case class TestFileSourceScanExecTransformer( override def getPartitions: Seq[InputPartition] = BackendsApiManager.getTransformerApiInstance.genInputPartitionSeq( relation, + requiredSchema, selectedPartitions, output, bucketedScan, diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/gluten/GlutenFallbackSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/gluten/GlutenFallbackSuite.scala index b85dd6a3518e7..6860d6a129583 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/gluten/GlutenFallbackSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/gluten/GlutenFallbackSuite.scala @@ -23,7 +23,9 @@ import org.apache.gluten.utils.BackendTestUtils import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent} import org.apache.spark.sql.{GlutenSQLTestsTrait, Row} +import org.apache.spark.sql.execution.ProjectExec import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.ui.{GlutenSQLAppStatusStore, SparkListenerSQLExecutionStart} import org.apache.spark.status.ElementTrackingStore @@ -161,4 +163,67 @@ class GlutenFallbackSuite extends GlutenSQLTestsTrait with AdaptiveSparkPlanHelp } } } + + test("Add logical link to rewritten spark plan") { + val events = new 
ArrayBuffer[GlutenPlanFallbackEvent] + val listener = new SparkListener { + override def onOtherEvent(event: SparkListenerEvent): Unit = { + event match { + case e: GlutenPlanFallbackEvent => events.append(e) + case _ => + } + } + } + spark.sparkContext.addSparkListener(listener) + withSQLConf(GlutenConfig.EXPRESSION_BLACK_LIST.key -> "add") { + try { + val df = spark.sql("select sum(id + 1) from range(10)") + df.collect() + spark.sparkContext.listenerBus.waitUntilEmpty() + val project = find(df.queryExecution.executedPlan) { + _.isInstanceOf[ProjectExec] + } + assert(project.isDefined) + assert( + events.exists(_.fallbackNodeToReason.values.toSet + .exists(_.contains("Not supported to map spark function name")))) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + } + } + + test("ExpandFallbackPolicy should propagate fallback reason to vanilla SparkPlan") { + val events = new ArrayBuffer[GlutenPlanFallbackEvent] + val listener = new SparkListener { + override def onOtherEvent(event: SparkListenerEvent): Unit = { + event match { + case e: GlutenPlanFallbackEvent => events.append(e) + case _ => + } + } + } + spark.sparkContext.addSparkListener(listener) + spark.range(10).selectExpr("id as c1", "id as c2").write.format("parquet").saveAsTable("t") + withTable("t") { + withSQLConf( + GlutenConfig.EXPRESSION_BLACK_LIST.key -> "max", + GlutenConfig.COLUMNAR_WHOLESTAGE_FALLBACK_THRESHOLD.key -> "1") { + try { + val df = spark.sql("select c2, max(c1) as id from t group by c2") + df.collect() + spark.sparkContext.listenerBus.waitUntilEmpty() + val agg = collect(df.queryExecution.executedPlan) { case a: HashAggregateExec => a } + assert(agg.size == 2) + assert( + events.count( + _.fallbackNodeToReason.values.toSet.exists(_.contains( + "Could not find a valid substrait mapping name for max" + ))) == 2) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + } + } + } } diff --git a/gluten-ut/spark34/pom.xml b/gluten-ut/spark34/pom.xml index d30f9644dbb4d..a8d24d5fd219f 100644 --- a/gluten-ut/spark34/pom.xml +++ b/gluten-ut/spark34/pom.xml @@ -83,7 +83,7 @@ org.apache.arrow arrow-c-data - ${arrow.version} + ${arrow-gluten.version} test diff --git a/gluten-ut/spark34/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql b/gluten-ut/spark34/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql deleted file mode 100644 index 48d1594fa51ac..0000000000000 --- a/gluten-ut/spark34/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql +++ /dev/null @@ -1,363 +0,0 @@ --- A test suite for scalar subquery in SELECT clause - -create temporary view t1 as select * from values - ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 00:00:00.000', date '2014-04-04'), - ('val1b', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1a', 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ('val1a', 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ('val1c', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ('val1d', null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ('val1d', null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ('val1e', 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date 
'2014-08-04'), - ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ('val1d', 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i); - -create temporary view t2 as select * from values - ('val2a', 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1b', 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ('val1c', 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ('val1b', null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ('val2e', 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ('val1f', 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ('val1c', 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ('val1e', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ('val1f', 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i); - -create temporary view t3 as select * from values - ('val3a', 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ('val3a', 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ('val3c', 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ('val3c', 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ('val1b', null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ('val3b', 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val3b', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i); - --- Group 1: scalar subquery in SELECT clause --- no correlation --- TC 01.01 --- more than one scalar subquery -SELECT (SELECT min(t3d) FROM t3) min_t3d, - (SELECT max(t2h) FROM t2) max_t2h -FROM t1 -WHERE t1a = 'val1c'; - --- TC 01.02 --- scalar 
subquery in an IN subquery -SELECT t1a, count(*) -FROM t1 -WHERE t1c IN (SELECT (SELECT min(t3c) FROM t3) - FROM t2 - GROUP BY t2g - HAVING count(*) > 1) -GROUP BY t1a; - --- TC 01.03 --- under a set op -SELECT (SELECT min(t3d) FROM t3) min_t3d, - null -FROM t1 -WHERE t1a = 'val1c' -UNION -SELECT null, - (SELECT max(t2h) FROM t2) max_t2h -FROM t1 -WHERE t1a = 'val1c'; - --- TC 01.04 -SELECT (SELECT min(t3c) FROM t3) min_t3d -FROM t1 -WHERE t1a = 'val1a' -INTERSECT -SELECT (SELECT min(t2c) FROM t2) min_t2d -FROM t1 -WHERE t1a = 'val1d'; - --- TC 01.05 -SELECT q1.t1a, q2.t2a, q1.min_t3d, q2.avg_t3d -FROM (SELECT t1a, (SELECT min(t3d) FROM t3) min_t3d - FROM t1 - WHERE t1a IN ('val1e', 'val1c')) q1 - FULL OUTER JOIN - (SELECT t2a, (SELECT avg(t3d) FROM t3) avg_t3d - FROM t2 - WHERE t2a IN ('val1c', 'val2a')) q2 -ON q1.t1a = q2.t2a -AND q1.min_t3d < q2.avg_t3d; - --- Group 2: scalar subquery in SELECT clause --- with correlation --- TC 02.01 -SELECT (SELECT min(t3d) FROM t3 WHERE t3.t3a = t1.t1a) min_t3d, - (SELECT max(t2h) FROM t2 WHERE t2.t2a = t1.t1a) max_t2h -FROM t1 -WHERE t1a = 'val1b'; - --- TC 02.02 -SELECT (SELECT min(t3d) FROM t3 WHERE t3a = t1a) min_t3d -FROM t1 -WHERE t1a = 'val1b' -MINUS -SELECT (SELECT min(t3d) FROM t3) abs_min_t3d -FROM t1 -WHERE t1a = 'val1b'; - --- TC 02.03 -SELECT t1a, t1b -FROM t1 -WHERE NOT EXISTS (SELECT (SELECT max(t2b) - FROM t2 LEFT JOIN t1 - ON t2a = t1a - WHERE t2c = t3c) dummy - FROM t3 - WHERE t3b < (SELECT max(t2b) - FROM t2 LEFT JOIN t1 - ON t2a = t1a - WHERE t2c = t3c) - AND t3a = t1a); - --- SPARK-34876: Non-nullable aggregates should not return NULL in a correlated subquery -SELECT t1a, - (SELECT count(t2d) FROM t2 WHERE t2a = t1a) count_t2, - (SELECT count_if(t2d > 0) FROM t2 WHERE t2a = t1a) count_if_t2, - (SELECT approx_count_distinct(t2d) FROM t2 WHERE t2a = t1a) approx_count_distinct_t2, - (SELECT collect_list(t2d) FROM t2 WHERE t2a = t1a) collect_list_t2, - (SELECT sort_array(collect_set(t2d)) FROM t2 WHERE t2a = t1a) collect_set_t2, - (SELECT hex(count_min_sketch(t2d, 0.5d, 0.5d, 1)) FROM t2 WHERE t2a = t1a) collect_set_t2 -FROM t1; - --- SPARK-36028: Allow Project to host outer references in scalar subqueries -SELECT t1c, (SELECT t1c) FROM t1; -SELECT t1c, (SELECT t1c WHERE t1c = 8) FROM t1; -SELECT t1c, t1d, (SELECT c + d FROM (SELECT t1c AS c, t1d AS d)) FROM t1; -SELECT t1c, (SELECT SUM(c) FROM (SELECT t1c AS c)) FROM t1; -SELECT t1a, (SELECT SUM(t2b) FROM t2 JOIN (SELECT t1a AS a) ON t2a = a) FROM t1; - --- CTE in correlated scalar subqueries -CREATE OR REPLACE TEMPORARY VIEW t1 AS VALUES (0, 1), (1, 2) t1(c1, c2); -CREATE OR REPLACE TEMPORARY VIEW t2 AS VALUES (0, 2), (0, 3) t2(c1, c2); - --- Single row subquery -SELECT c1, (WITH t AS (SELECT 1 AS a) SELECT a + c1 FROM t) FROM t1; --- Correlation in CTE. -SELECT c1, (WITH t AS (SELECT * FROM t2 WHERE c1 = t1.c1) SELECT SUM(c2) FROM t) FROM t1; --- Multiple CTE definitions. -SELECT c1, ( - WITH t3 AS (SELECT c1 + 1 AS c1, c2 + 1 AS c2 FROM t2), - t4 AS (SELECT * FROM t3 WHERE t1.c1 = c1) - SELECT SUM(c2) FROM t4 -) FROM t1; --- Multiple CTE references. -SELECT c1, ( - WITH t AS (SELECT * FROM t2) - SELECT SUM(c2) FROM (SELECT c1, c2 FROM t UNION SELECT c2, c1 FROM t) r(c1, c2) - WHERE c1 = t1.c1 -) FROM t1; --- Reference CTE in both the main query and the subquery. 
-WITH v AS (SELECT * FROM t2) -SELECT * FROM t1 WHERE c1 > ( - WITH t AS (SELECT * FROM t2) - SELECT COUNT(*) FROM v WHERE c1 = t1.c1 AND c1 > (SELECT SUM(c2) FROM t WHERE c1 = v.c1) -); --- Single row subquery that references CTE in the main query. -WITH t AS (SELECT 1 AS a) -SELECT c1, (SELECT a FROM t WHERE a = c1) FROM t1; --- Multiple CTE references with non-deterministic CTEs. -WITH -v1 AS (SELECT c1, c2, rand(0) c3 FROM t1), -v2 AS (SELECT c1, c2, rand(0) c4 FROM v1 WHERE c3 IN (SELECT c3 FROM v1)) -SELECT c1, ( - WITH v3 AS (SELECT c1, c2, rand(0) c5 FROM t2) - SELECT COUNT(*) FROM ( - SELECT * FROM v2 WHERE c1 > 0 - UNION SELECT * FROM v2 WHERE c2 > 0 - UNION SELECT * FROM v3 WHERE c2 > 0 - ) WHERE c1 = v1.c1 -) FROM v1; - --- Multi-value subquery error -SELECT (SELECT a FROM (SELECT 1 AS a UNION ALL SELECT 2 AS a) t) AS b; - --- SPARK-36114: Support correlated non-equality predicates -CREATE OR REPLACE TEMP VIEW t1(c1, c2) AS (VALUES (0, 1), (1, 2)); -CREATE OR REPLACE TEMP VIEW t2(c1, c2) AS (VALUES (0, 2), (0, 3)); - --- Neumann example Q2 -CREATE OR REPLACE TEMP VIEW students(id, name, major, year) AS (VALUES - (0, 'A', 'CS', 2022), - (1, 'B', 'CS', 2022), - (2, 'C', 'Math', 2022)); -CREATE OR REPLACE TEMP VIEW exams(sid, course, curriculum, grade, date) AS (VALUES - (0, 'C1', 'CS', 4, 2020), - (0, 'C2', 'CS', 3, 2021), - (1, 'C1', 'CS', 2, 2020), - (1, 'C2', 'CS', 1, 2021)); - -SELECT students.name, exams.course -FROM students, exams -WHERE students.id = exams.sid - AND (students.major = 'CS' OR students.major = 'Games Eng') - AND exams.grade >= ( - SELECT avg(exams.grade) + 1 - FROM exams - WHERE students.id = exams.sid - OR (exams.curriculum = students.major AND students.year > exams.date)); - --- Correlated non-equality predicates -SELECT (SELECT min(c2) FROM t2 WHERE t1.c1 > t2.c1) FROM t1; -SELECT (SELECT min(c2) FROM t2 WHERE t1.c1 >= t2.c1 AND t1.c2 < t2.c2) FROM t1; - --- Correlated non-equality predicates with the COUNT bug. 
-SELECT (SELECT count(*) FROM t2 WHERE t1.c1 > t2.c1) FROM t1; - --- Correlated equality predicates that are not supported after SPARK-35080 -SELECT c, ( - SELECT count(*) - FROM (VALUES ('ab'), ('abc'), ('bc')) t2(c) - WHERE t1.c = substring(t2.c, 1, 1) -) FROM (VALUES ('a'), ('b')) t1(c); - -SELECT c, ( - SELECT count(*) - FROM (VALUES (0, 6), (1, 5), (2, 4), (3, 3)) t1(a, b) - WHERE a + b = c -) FROM (VALUES (6)) t2(c); - --- SPARK-43156: scalar subquery with Literal result like `COUNT(1) is null` -SELECT *, (SELECT count(1) is null FROM t2 WHERE t1.c1 = t2.c1) FROM t1; - -select (select f from (select false as f, max(c2) from t1 where t1.c1 = t1.c1)) from t2; - --- Set operations in correlation path - -CREATE OR REPLACE TEMP VIEW t0(t0a, t0b) AS VALUES (1, 1), (2, 0); -CREATE OR REPLACE TEMP VIEW t1(t1a, t1b, t1c) AS VALUES (1, 1, 3); -CREATE OR REPLACE TEMP VIEW t2(t2a, t2b, t2c) AS VALUES (1, 1, 5), (2, 2, 7); - -SELECT t0a, (SELECT sum(c) FROM - (SELECT t1c as c - FROM t1 - WHERE t1a = t0a - UNION ALL - SELECT t2c as c - FROM t2 - WHERE t2b = t0b) -) -FROM t0; - -SELECT t0a, (SELECT sum(c) FROM - (SELECT t1c as c - FROM t1 - WHERE t1a = t0a - UNION ALL - SELECT t2c as c - FROM t2 - WHERE t2a = t0a) -) -FROM t0; - -SELECT t0a, (SELECT sum(c) FROM - (SELECT t1c as c - FROM t1 - WHERE t1a > t0a - UNION ALL - SELECT t2c as c - FROM t2 - WHERE t2b <= t0b) -) -FROM t0; - -SELECT t0a, (SELECT sum(t1c) FROM - (SELECT t1c - FROM t1 - WHERE t1a = t0a - UNION ALL - SELECT t2c - FROM t2 - WHERE t2b = t0b) -) -FROM t0; - -SELECT t0a, (SELECT sum(t1c) FROM - (SELECT t1c - FROM t1 - WHERE t1a = t0a - UNION DISTINCT - SELECT t2c - FROM t2 - WHERE t2b = t0b) -) -FROM t0; - --- Tests for column aliasing -SELECT t0a, (SELECT sum(t1a + 3 * t1b + 5 * t1c) FROM - (SELECT t1c as t1a, t1a as t1b, t0a as t1c - FROM t1 - WHERE t1a = t0a - UNION ALL - SELECT t0a as t2b, t2c as t1a, t0b as t2c - FROM t2 - WHERE t2b = t0b) -) -FROM t0; - --- Test handling of COUNT bug -SELECT t0a, (SELECT count(t1c) FROM - (SELECT t1c - FROM t1 - WHERE t1a = t0a - UNION DISTINCT - SELECT t2c - FROM t2 - WHERE t2b = t0b) -) -FROM t0; - --- Correlated references in project -SELECT t0a, (SELECT sum(d) FROM - (SELECT t1a - t0a as d - FROM t1 - UNION ALL - SELECT t2a - t0a as d - FROM t2) -) -FROM t0; - --- Correlated references in aggregate - unsupported -SELECT t0a, (SELECT sum(d) FROM - (SELECT sum(t0a) as d - FROM t1 - UNION ALL - SELECT sum(t2a) + t0a as d - FROM t2) -) -FROM t0; - --- SPARK-43760: the result of the subquery can be NULL. 
-select * -from -( - select t1.id c1, ( - select sum(c) - from ( - select t2.id * t2.id c - from range (1, 2) t2 where t1.id = t2.id - group by t2.id - ) - ) c2 - from range (1, 3) t1 -) t -where t.c2 is not null; diff --git a/gluten-ut/spark34/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out b/gluten-ut/spark34/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out deleted file mode 100644 index 088359d39b86e..0000000000000 --- a/gluten-ut/spark34/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out +++ /dev/null @@ -1,791 +0,0 @@ --- Automatically generated by GlutenSQLQueryTestSuite --- Number of queries: 52 - - --- !query -create temporary view t1 as select * from values - ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 00:00:00.000', date '2014-04-04'), - ('val1b', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1a', 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ('val1a', 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ('val1c', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ('val1d', null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ('val1d', null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ('val1e', 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ('val1d', 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query schema -struct<> --- !query output - - - --- !query -create temporary view t2 as select * from values - ('val2a', 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1b', 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ('val1c', 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ('val1b', null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ('val2e', 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ('val1f', 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ('val1c', 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ('val1e', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ('val1f', 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp 
'2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query schema -struct<> --- !query output - - - --- !query -create temporary view t3 as select * from values - ('val3a', 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ('val3a', 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ('val3c', 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ('val3c', 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ('val1b', null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ('val3b', 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val3b', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query schema -struct<> --- !query output - - - --- !query -SELECT (SELECT min(t3d) FROM t3) min_t3d, - (SELECT max(t2h) FROM t2) max_t2h -FROM t1 -WHERE t1a = 'val1c' --- !query schema -struct --- !query output -10 2017-05-04 01:01:00 - - --- !query -SELECT t1a, count(*) -FROM t1 -WHERE t1c IN (SELECT (SELECT min(t3c) FROM t3) - FROM t2 - GROUP BY t2g - HAVING count(*) > 1) -GROUP BY t1a --- !query schema -struct --- !query output -val1a 2 - - --- !query -SELECT (SELECT min(t3d) FROM t3) min_t3d, - null -FROM t1 -WHERE t1a = 'val1c' -UNION -SELECT null, - (SELECT max(t2h) FROM t2) max_t2h -FROM t1 -WHERE t1a = 'val1c' --- !query schema -struct --- !query output -10 NULL -NULL 2017-05-04 01:01:00 - - --- !query -SELECT (SELECT min(t3c) FROM t3) min_t3d -FROM t1 -WHERE t1a = 'val1a' -INTERSECT -SELECT (SELECT min(t2c) FROM t2) min_t2d -FROM t1 -WHERE t1a = 'val1d' --- !query schema -struct --- !query output -12 - - --- !query -SELECT q1.t1a, q2.t2a, q1.min_t3d, q2.avg_t3d -FROM (SELECT t1a, (SELECT min(t3d) FROM t3) min_t3d - FROM t1 - WHERE t1a IN ('val1e', 'val1c')) q1 - FULL OUTER JOIN - (SELECT t2a, (SELECT avg(t3d) FROM t3) avg_t3d - FROM t2 - WHERE t2a IN ('val1c', 'val2a')) q2 -ON q1.t1a = q2.t2a -AND q1.min_t3d < q2.avg_t3d --- !query schema -struct --- !query output -NULL val2a NULL 200.83333333333334 -val1c val1c 10 200.83333333333334 -val1c val1c 10 200.83333333333334 -val1e NULL 10 NULL -val1e NULL 10 NULL -val1e NULL 10 NULL - - --- !query -SELECT (SELECT min(t3d) FROM t3 WHERE t3.t3a = t1.t1a) min_t3d, - (SELECT max(t2h) FROM t2 WHERE t2.t2a = t1.t1a) max_t2h -FROM t1 -WHERE t1a = 'val1b' --- !query schema -struct --- !query output -19 2017-05-04 01:01:00 - - --- !query -SELECT (SELECT min(t3d) FROM t3 WHERE t3a = t1a) min_t3d -FROM t1 -WHERE t1a = 'val1b' -MINUS -SELECT (SELECT min(t3d) FROM t3) abs_min_t3d -FROM t1 -WHERE t1a = 'val1b' --- !query schema -struct --- !query output -19 - - --- !query -SELECT t1a, t1b -FROM t1 -WHERE NOT EXISTS (SELECT (SELECT max(t2b) - FROM t2 LEFT JOIN t1 - ON t2a = t1a - 
WHERE t2c = t3c) dummy - FROM t3 - WHERE t3b < (SELECT max(t2b) - FROM t2 LEFT JOIN t1 - ON t2a = t1a - WHERE t2c = t3c) - AND t3a = t1a) --- !query schema -struct --- !query output -val1a 16 -val1a 16 -val1a 6 -val1a 6 -val1c 8 -val1d 10 -val1d NULL -val1d NULL -val1e 10 -val1e 10 -val1e 10 - - --- !query -SELECT t1a, - (SELECT count(t2d) FROM t2 WHERE t2a = t1a) count_t2, - (SELECT count_if(t2d > 0) FROM t2 WHERE t2a = t1a) count_if_t2, - (SELECT approx_count_distinct(t2d) FROM t2 WHERE t2a = t1a) approx_count_distinct_t2, - (SELECT collect_list(t2d) FROM t2 WHERE t2a = t1a) collect_list_t2, - (SELECT sort_array(collect_set(t2d)) FROM t2 WHERE t2a = t1a) collect_set_t2, - (SELECT hex(count_min_sketch(t2d, 0.5d, 0.5d, 1)) FROM t2 WHERE t2a = t1a) collect_set_t2 -FROM t1 --- !query schema -struct,collect_set_t2:array,collect_set_t2:string> --- !query output -val1a 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1a 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1a 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1a 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1b 6 6 3 [19,119,319,19,19,19] [19,119,319] 0000000100000000000000060000000100000004000000005D8D6AB90000000000000000000000000000000400000000000000010000000000000001 -val1c 2 2 2 [219,19] [19,219] 0000000100000000000000020000000100000004000000005D8D6AB90000000000000000000000000000000100000000000000000000000000000001 -val1d 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1d 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1d 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1e 1 1 1 [19] [19] 0000000100000000000000010000000100000004000000005D8D6AB90000000000000000000000000000000100000000000000000000000000000000 -val1e 1 1 1 [19] [19] 0000000100000000000000010000000100000004000000005D8D6AB90000000000000000000000000000000100000000000000000000000000000000 -val1e 1 1 1 [19] [19] 0000000100000000000000010000000100000004000000005D8D6AB90000000000000000000000000000000100000000000000000000000000000000 - - --- !query -SELECT t1c, (SELECT t1c) FROM t1 --- !query schema -struct --- !query output -12 12 -12 12 -16 16 -16 16 -16 16 -16 16 -8 8 -8 8 -NULL NULL -NULL NULL -NULL NULL -NULL NULL - - --- !query -SELECT t1c, (SELECT t1c WHERE t1c = 8) FROM t1 --- !query schema -struct --- !query output -12 NULL -12 NULL -16 NULL -16 NULL -16 NULL -16 NULL -8 8 -8 8 -NULL NULL -NULL NULL -NULL NULL -NULL NULL - - --- !query -SELECT t1c, t1d, (SELECT c + d FROM (SELECT t1c AS c, t1d AS d)) FROM t1 --- !query schema -struct --- !query output -12 10 22 -12 21 33 -16 19 35 -16 19 35 -16 19 35 -16 22 38 -8 10 18 -8 10 18 -NULL 12 NULL -NULL 19 NULL -NULL 19 NULL -NULL 25 NULL - - --- !query -SELECT t1c, (SELECT SUM(c) FROM (SELECT t1c AS c)) FROM t1 --- !query schema -struct --- !query output -12 12 -12 12 -16 16 -16 16 -16 16 -16 16 -8 8 -8 8 -NULL NULL -NULL NULL -NULL NULL -NULL NULL - - --- !query -SELECT t1a, (SELECT SUM(t2b) FROM t2 JOIN (SELECT 
t1a AS a) ON t2a = a) FROM t1 --- !query schema -struct --- !query output -val1a NULL -val1a NULL -val1a NULL -val1a NULL -val1b 36 -val1c 24 -val1d NULL -val1d NULL -val1d NULL -val1e 8 -val1e 8 -val1e 8 - - --- !query -CREATE OR REPLACE TEMPORARY VIEW t1 AS VALUES (0, 1), (1, 2) t1(c1, c2) --- !query schema -struct<> --- !query output - - - --- !query -CREATE OR REPLACE TEMPORARY VIEW t2 AS VALUES (0, 2), (0, 3) t2(c1, c2) --- !query schema -struct<> --- !query output - - - --- !query -SELECT c1, (WITH t AS (SELECT 1 AS a) SELECT a + c1 FROM t) FROM t1 --- !query schema -struct --- !query output -0 1 -1 2 - - --- !query -SELECT c1, (WITH t AS (SELECT * FROM t2 WHERE c1 = t1.c1) SELECT SUM(c2) FROM t) FROM t1 --- !query schema -struct --- !query output -0 5 -1 NULL - - --- !query -SELECT c1, ( - WITH t3 AS (SELECT c1 + 1 AS c1, c2 + 1 AS c2 FROM t2), - t4 AS (SELECT * FROM t3 WHERE t1.c1 = c1) - SELECT SUM(c2) FROM t4 -) FROM t1 --- !query schema -struct --- !query output -0 NULL -1 7 - - --- !query -SELECT c1, ( - WITH t AS (SELECT * FROM t2) - SELECT SUM(c2) FROM (SELECT c1, c2 FROM t UNION SELECT c2, c1 FROM t) r(c1, c2) - WHERE c1 = t1.c1 -) FROM t1 --- !query schema -struct --- !query output -0 5 -1 NULL - - --- !query -WITH v AS (SELECT * FROM t2) -SELECT * FROM t1 WHERE c1 > ( - WITH t AS (SELECT * FROM t2) - SELECT COUNT(*) FROM v WHERE c1 = t1.c1 AND c1 > (SELECT SUM(c2) FROM t WHERE c1 = v.c1) -) --- !query schema -struct --- !query output -1 2 - - --- !query -WITH t AS (SELECT 1 AS a) -SELECT c1, (SELECT a FROM t WHERE a = c1) FROM t1 --- !query schema -struct --- !query output -0 NULL -1 1 - - --- !query -WITH -v1 AS (SELECT c1, c2, rand(0) c3 FROM t1), -v2 AS (SELECT c1, c2, rand(0) c4 FROM v1 WHERE c3 IN (SELECT c3 FROM v1)) -SELECT c1, ( - WITH v3 AS (SELECT c1, c2, rand(0) c5 FROM t2) - SELECT COUNT(*) FROM ( - SELECT * FROM v2 WHERE c1 > 0 - UNION SELECT * FROM v2 WHERE c2 > 0 - UNION SELECT * FROM v3 WHERE c2 > 0 - ) WHERE c1 = v1.c1 -) FROM v1 --- !query schema -struct --- !query output -0 3 -1 1 - - --- !query -SELECT (SELECT a FROM (SELECT 1 AS a UNION ALL SELECT 2 AS a) t) AS b --- !query schema -struct<> --- !query output -java.lang.IllegalStateException -more than one row returned by a subquery used as an expression: -Subquery subquery#1, [id=#2] -+- AdaptiveSparkPlan isFinalPlan=true - +- == Final Plan == - VeloxColumnarToRowExec - +- ColumnarUnion - :- ProjectExecTransformer [1 AS a#3] - : +- InputIteratorTransformer[fake_column#4] - : +- InputAdapter - : +- RowToVeloxColumnar - : +- Scan OneRowRelation[fake_column#4] - +- ProjectExecTransformer [2 AS a#5] - +- InputIteratorTransformer[fake_column#6] - +- InputAdapter - +- RowToVeloxColumnar - +- Scan OneRowRelation[fake_column#6] - +- == Initial Plan == - Union - :- Project [1 AS a#3] - : +- Scan OneRowRelation[] - +- Project [2 AS a#5] - +- Scan OneRowRelation[] - - --- !query -CREATE OR REPLACE TEMP VIEW t1(c1, c2) AS (VALUES (0, 1), (1, 2)) --- !query schema -struct<> --- !query output - - - --- !query -CREATE OR REPLACE TEMP VIEW t2(c1, c2) AS (VALUES (0, 2), (0, 3)) --- !query schema -struct<> --- !query output - - - --- !query -CREATE OR REPLACE TEMP VIEW students(id, name, major, year) AS (VALUES - (0, 'A', 'CS', 2022), - (1, 'B', 'CS', 2022), - (2, 'C', 'Math', 2022)) --- !query schema -struct<> --- !query output - - - --- !query -CREATE OR REPLACE TEMP VIEW exams(sid, course, curriculum, grade, date) AS (VALUES - (0, 'C1', 'CS', 4, 2020), - (0, 'C2', 'CS', 3, 2021), - (1, 'C1', 'CS', 2, 2020), - (1, 
'C2', 'CS', 1, 2021)) --- !query schema -struct<> --- !query output - - - --- !query -SELECT students.name, exams.course -FROM students, exams -WHERE students.id = exams.sid - AND (students.major = 'CS' OR students.major = 'Games Eng') - AND exams.grade >= ( - SELECT avg(exams.grade) + 1 - FROM exams - WHERE students.id = exams.sid - OR (exams.curriculum = students.major AND students.year > exams.date)) --- !query schema -struct --- !query output -A C1 - - --- !query -SELECT (SELECT min(c2) FROM t2 WHERE t1.c1 > t2.c1) FROM t1 --- !query schema -struct --- !query output -2 -NULL - - --- !query -SELECT (SELECT min(c2) FROM t2 WHERE t1.c1 >= t2.c1 AND t1.c2 < t2.c2) FROM t1 --- !query schema -struct --- !query output -2 -3 - - --- !query -SELECT (SELECT count(*) FROM t2 WHERE t1.c1 > t2.c1) FROM t1 --- !query schema -struct --- !query output -0 -2 - - --- !query -SELECT c, ( - SELECT count(*) - FROM (VALUES ('ab'), ('abc'), ('bc')) t2(c) - WHERE t1.c = substring(t2.c, 1, 1) -) FROM (VALUES ('a'), ('b')) t1(c) --- !query schema -struct --- !query output -a 2 -b 1 - - --- !query -SELECT c, ( - SELECT count(*) - FROM (VALUES (0, 6), (1, 5), (2, 4), (3, 3)) t1(a, b) - WHERE a + b = c -) FROM (VALUES (6)) t2(c) --- !query schema -struct --- !query output -6 4 - - --- !query -SELECT *, (SELECT count(1) is null FROM t2 WHERE t1.c1 = t2.c1) FROM t1 --- !query schema -struct --- !query output -0 1 false -1 2 false - - --- !query -select (select f from (select false as f, max(c2) from t1 where t1.c1 = t1.c1)) from t2 --- !query schema -struct --- !query output -false -false - - --- !query -CREATE OR REPLACE TEMP VIEW t0(t0a, t0b) AS VALUES (1, 1), (2, 0) --- !query schema -struct<> --- !query output - - - --- !query -CREATE OR REPLACE TEMP VIEW t1(t1a, t1b, t1c) AS VALUES (1, 1, 3) --- !query schema -struct<> --- !query output - - - --- !query -CREATE OR REPLACE TEMP VIEW t2(t2a, t2b, t2c) AS VALUES (1, 1, 5), (2, 2, 7) --- !query schema -struct<> --- !query output - - - --- !query -SELECT t0a, (SELECT sum(c) FROM - (SELECT t1c as c - FROM t1 - WHERE t1a = t0a - UNION ALL - SELECT t2c as c - FROM t2 - WHERE t2b = t0b) -) -FROM t0 --- !query schema -struct --- !query output -1 8 -2 NULL - - --- !query -SELECT t0a, (SELECT sum(c) FROM - (SELECT t1c as c - FROM t1 - WHERE t1a = t0a - UNION ALL - SELECT t2c as c - FROM t2 - WHERE t2a = t0a) -) -FROM t0 --- !query schema -struct --- !query output -1 8 -2 7 - - --- !query -SELECT t0a, (SELECT sum(c) FROM - (SELECT t1c as c - FROM t1 - WHERE t1a > t0a - UNION ALL - SELECT t2c as c - FROM t2 - WHERE t2b <= t0b) -) -FROM t0 --- !query schema -struct --- !query output -1 5 -2 NULL - - --- !query -SELECT t0a, (SELECT sum(t1c) FROM - (SELECT t1c - FROM t1 - WHERE t1a = t0a - UNION ALL - SELECT t2c - FROM t2 - WHERE t2b = t0b) -) -FROM t0 --- !query schema -struct --- !query output -1 8 -2 NULL - - --- !query -SELECT t0a, (SELECT sum(t1c) FROM - (SELECT t1c - FROM t1 - WHERE t1a = t0a - UNION DISTINCT - SELECT t2c - FROM t2 - WHERE t2b = t0b) -) -FROM t0 --- !query schema -struct --- !query output -1 8 -2 NULL - - --- !query -SELECT t0a, (SELECT sum(t1a + 3 * t1b + 5 * t1c) FROM - (SELECT t1c as t1a, t1a as t1b, t0a as t1c - FROM t1 - WHERE t1a = t0a - UNION ALL - SELECT t0a as t2b, t2c as t1a, t0b as t2c - FROM t2 - WHERE t2b = t0b) -) -FROM t0 --- !query schema -struct --- !query output -1 32 -2 NULL - - --- !query -SELECT t0a, (SELECT count(t1c) FROM - (SELECT t1c - FROM t1 - WHERE t1a = t0a - UNION DISTINCT - SELECT t2c - FROM t2 - WHERE t2b = t0b) -) -FROM t0 
--- !query schema -struct --- !query output -1 2 -2 0 - - --- !query -SELECT t0a, (SELECT sum(d) FROM - (SELECT t1a - t0a as d - FROM t1 - UNION ALL - SELECT t2a - t0a as d - FROM t2) -) -FROM t0 --- !query schema -struct --- !query output -1 1 -2 -2 - - --- !query -SELECT t0a, (SELECT sum(d) FROM - (SELECT sum(t0a) as d - FROM t1 - UNION ALL - SELECT sum(t2a) + t0a as d - FROM t2) -) -FROM t0 --- !query schema -struct<> --- !query output -org.apache.spark.sql.AnalysisException -{ - "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", - "sqlState" : "0A000", - "messageParameters" : { - "sqlExprs" : "\"sum(t0a) AS d\"" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 36, - "stopIndex" : 67, - "fragment" : "SELECT sum(t0a) as d\n FROM t1" - } ] -} - - --- !query -select * -from -( - select t1.id c1, ( - select sum(c) - from ( - select t2.id * t2.id c - from range (1, 2) t2 where t1.id = t2.id - group by t2.id - ) - ) c2 - from range (1, 3) t1 -) t -where t.c2 is not null --- !query schema -struct --- !query output -1 1 diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 069d697bd454b..0dc2cdd89f935 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -76,6 +76,7 @@ class ClickHouseTestSettings extends BackendTestSettings { false // nativeDoValidate failed due to spark conf cleanup case "GlutenBloomFilterAggregateQuerySuite" => !bloomFilterCases.contains(testName) + case "GlutenSortShuffleSuite" => false case _ => true } preCheck && super.shouldRun(suiteName, testName) @@ -323,9 +324,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-30793: truncate timestamps before the epoch to seconds and minutes") .excludeGlutenTest("unix_timestamp") .excludeGlutenTest("to_unix_timestamp") - .exclude("to_utc_timestamp with literal zone") .exclude("to_utc_timestamp with column zone") - .exclude("from_utc_timestamp with literal zone") .exclude("from_utc_timestamp with column zone") enableSuite[GlutenDeprecatedAPISuite] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOff].excludeGlutenTest( @@ -458,8 +457,10 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("string regex_replace / regex_extract") .exclude("string overlay function") .exclude("binary overlay function") - .exclude("string / binary substring function") .exclude("string parse_url function") + .exclude("string / binary length function") + .exclude("SPARK-36751: add octet length api for scala") + .exclude("SPARK-36751: add bit length api for scala") enableSuite[GlutenSubquerySuite] .exclude("SPARK-15370: COUNT bug in subquery in subquery in subquery") .exclude("SPARK-26893: Allow pushdown of partition pruning subquery filters to file source") @@ -490,7 +491,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-36920: Support day-time intervals by ABS") .exclude("SPARK-36921: Support YearMonthIntervalType by div") .exclude("SPARK-36921: Support DayTimeIntervalType by div") - enableSuite[GlutenBitwiseExpressionsSuite].exclude("BitGet") + enableSuite[GlutenBitwiseExpressionsSuite] enableSuite[GlutenCastSuite] .exclude("null cast") .exclude("cast string to date") @@ -757,34 +758,21 @@ class 
ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK - 34814: LikeSimplification should handleNULL") enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix") enableSuite[GlutenStringExpressionsSuite] - .exclude("concat") .exclude("StringComparison") .exclude("Substring") .exclude("string substring_index function") .exclude("SPARK-40213: ascii for Latin-1 Supplement characters") .exclude("ascii for string") - .exclude("string for ascii") .exclude("base64/unbase64 for string") .exclude("encode/decode for string") .exclude("Levenshtein distance") - .exclude("soundex unit test") - .exclude("replace") .exclude("overlay for string") .exclude("overlay for byte array") .exclude("translate") - .exclude("FORMAT") - .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB") - .exclude("INSTR") .exclude("LOCATE") - .exclude("LPAD/RPAD") .exclude("REPEAT") - .exclude("length for string / binary") - .exclude("format_number / FormatNumber") - .exclude("ToNumber: positive tests") - .exclude("ToNumber: negative tests (the input string does not match the format string)") .exclude("ParseUrl") .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url") - .exclude("Sentences") enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite] enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite] enableSuite[GlutenDataSourceV2SQLSuiteV1Filter] diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala index 5067de74ea7e6..345971e9ffc0f 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala @@ -134,6 +134,7 @@ object VeloxSQLQueryTestSettings extends SQLQueryTestSettings { "subquery/negative-cases/invalid-correlation.sql", "subquery/negative-cases/subq-input-typecheck.sql", "subquery/scalar-subquery/scalar-subquery-predicate.sql", + "subquery/scalar-subquery/scalar-subquery-select.sql", "subquery/subquery-in-from.sql", "postgreSQL/aggregates_part1.sql", "postgreSQL/aggregates_part2.sql", @@ -241,9 +242,6 @@ object VeloxSQLQueryTestSettings extends SQLQueryTestSettings { "group-by.sql", "udf/udf-group-by.sql", // Overwrite some results of regr_intercept, regr_r2, corr. 
- "linear-regression.sql", - // Exception string doesn't match for - // SELECT (SELECT a FROM (SELECT 1 AS a UNION ALL SELECT 2 AS a) t) AS b - "subquery/scalar-subquery/scalar-subquery-select.sql" + "linear-regression.sql" ) } diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 3a993189d6aec..0da19922ffdaf 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -18,8 +18,9 @@ package org.apache.gluten.utils.velox import org.apache.gluten.utils.{BackendTestSettings, SQLQueryTestSettings} +import org.apache.spark.GlutenSortShuffleSuite import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.expressions.{GlutenArithmeticExpressionSuite, GlutenBitwiseExpressionsSuite, GlutenCastSuite, GlutenCollectionExpressionsSuite, GlutenComplexTypeSuite, GlutenConditionalExpressionSuite, GlutenDateExpressionsSuite, GlutenDecimalExpressionSuite, GlutenDecimalPrecisionSuite, GlutenHashExpressionsSuite, GlutenHigherOrderFunctionsSuite, GlutenIntervalExpressionsSuite, GlutenLiteralExpressionSuite, GlutenMathExpressionsSuite, GlutenMiscExpressionsSuite, GlutenNondeterministicSuite, GlutenNullExpressionsSuite, GlutenPredicateSuite, GlutenRandomSuite, GlutenRegexpExpressionsSuite, GlutenSortOrderExpressionsSuite, GlutenStringExpressionsSuite} +import org.apache.spark.sql.catalyst.expressions.{GlutenArithmeticExpressionSuite, GlutenBitwiseExpressionsSuite, GlutenCastSuite, GlutenCollectionExpressionsSuite, GlutenComplexTypeSuite, GlutenConditionalExpressionSuite, GlutenDateExpressionsSuite, GlutenDecimalExpressionSuite, GlutenDecimalPrecisionSuite, GlutenHashExpressionsSuite, GlutenHigherOrderFunctionsSuite, GlutenIntervalExpressionsSuite, GlutenLiteralExpressionSuite, GlutenMathExpressionsSuite, GlutenMiscExpressionsSuite, GlutenNondeterministicSuite, GlutenNullExpressionsSuite, GlutenPredicateSuite, GlutenRandomSuite, GlutenRegexpExpressionsSuite, GlutenSortOrderExpressionsSuite, GlutenStringExpressionsSuite, GlutenTryEvalSuite} import org.apache.spark.sql.connector.{GlutenDataSourceV2DataFrameSessionCatalogSuite, GlutenDataSourceV2DataFrameSuite, GlutenDataSourceV2FunctionSuite, GlutenDataSourceV2SQLSessionCatalogSuite, GlutenDataSourceV2SQLSuiteV1Filter, GlutenDataSourceV2SQLSuiteV2Filter, GlutenDataSourceV2Suite, GlutenDeleteFromTableSuite, GlutenDeltaBasedDeleteFromTableSuite, GlutenFileDataSourceV2FallBackSuite, GlutenGroupBasedDeleteFromTableSuite, GlutenKeyGroupedPartitioningSuite, GlutenLocalScanSuite, GlutenMetadataColumnSuite, GlutenSupportsCatalogOptionsSuite, GlutenTableCapabilityCheckSuite, GlutenWriteDistributionAndOrderingSuite} import org.apache.spark.sql.errors.{GlutenQueryCompilationErrorsDSv2Suite, GlutenQueryCompilationErrorsSuite, GlutenQueryExecutionErrorsSuite, GlutenQueryParsingErrorsSuite} import org.apache.spark.sql.execution.{FallbackStrategiesSuite, GlutenBroadcastExchangeSuite, GlutenCoalesceShufflePartitionsSuite, GlutenExchangeSuite, GlutenLocalBroadcastExchangeSuite, GlutenReplaceHashWithSortAggSuite, GlutenReuseExchangeAndSubquerySuite, GlutenSameResultSuite, GlutenSortSuite, GlutenSQLAggregateFunctionSuite, GlutenSQLWindowFunctionSuite, GlutenTakeOrderedAndProjectSuite} @@ -80,8 +81,6 @@ class VeloxTestSettings extends BackendTestSettings { // NEW SUITE: disable as it expects exception 
which doesn't happen when offloaded to gluten .exclude( "INCONSISTENT_BEHAVIOR_CROSS_VERSION: compatibility with Spark 2.4/3.2 in reading/writing dates") - // gluten throws different exception - .excludeByPrefix("SCALAR_SUBQUERY_TOO_MANY_ROWS:") // Doesn't support unhex with failOnError=true. .exclude("CONVERSION_INVALID_INPUT: to_binary conversion function hex") enableSuite[GlutenQueryParsingErrorsSuite] @@ -139,8 +138,10 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("random") .exclude("SPARK-9127 codegen with long seed") enableSuite[GlutenRegexpExpressionsSuite] + enableSuite[GlutenSortShuffleSuite] enableSuite[GlutenSortOrderExpressionsSuite] enableSuite[GlutenStringExpressionsSuite] + enableSuite[GlutenTryEvalSuite] enableSuite[VeloxAdaptiveQueryExecSuite] .includeAllGlutenTests() .includeByPrefix( @@ -184,6 +185,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("save csv with empty fields with user defined empty values") .exclude("save csv with quote") .exclude("SPARK-13543 Write the output as uncompressed via option()") + .exclude("DDL test with tab separated file") + .exclude("DDL test parsing decimal type") + .exclude("test with tab delimiter and double quote") // Arrow not support corrupt record .exclude("SPARK-27873: disabling enforceSchema should not fail columnNameOfCorruptRecord") enableSuite[GlutenCSVv2Suite] @@ -196,6 +200,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("save csv with empty fields with user defined empty values") .exclude("save csv with quote") .exclude("SPARK-13543 Write the output as uncompressed via option()") + .exclude("DDL test with tab separated file") + .exclude("DDL test parsing decimal type") + .exclude("test with tab delimiter and double quote") // Rule org.apache.spark.sql.execution.datasources.v2.V2ScanRelationPushDown in batch // Early Filter and Projection Push-Down generated an invalid plan .exclude("SPARK-26208: write and read empty data to csv file with headers") @@ -207,6 +214,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("save csv with empty fields with user defined empty values") .exclude("save csv with quote") .exclude("SPARK-13543 Write the output as uncompressed via option()") + .exclude("DDL test with tab separated file") + .exclude("DDL test parsing decimal type") + .exclude("test with tab delimiter and double quote") // Arrow not support corrupt record .exclude("SPARK-27873: disabling enforceSchema should not fail columnNameOfCorruptRecord") enableSuite[GlutenJsonV1Suite] @@ -754,6 +764,7 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenFileFormatWriterSuite] enableSuite[GlutenFileIndexSuite] enableSuite[GlutenFileMetadataStructSuite] + enableSuite[GlutenFileMetadataStructRowIndexSuite] enableSuite[GlutenParquetV1AggregatePushDownSuite] enableSuite[GlutenParquetV2AggregatePushDownSuite] enableSuite[GlutenOrcV1AggregatePushDownSuite] @@ -846,7 +857,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("Shouldn't change broadcast join buildSide if user clearly specified") .exclude("Shouldn't bias towards build right if user didn't specify") .exclude("SPARK-23192: broadcast hint should be retained after using the cached data") - .exclude("broadcast hint isn't propagated after a join") .exclude("broadcast join where streamed side's output partitioning is HashPartitioning") .exclude("broadcast join where streamed side's output partitioning is PartitioningCollection") @@ -1063,14 +1073,6 @@ class VeloxTestSettings 
extends BackendTestSettings { .exclude("to_timestamp") // Legacy mode is not supported, assuming this mode is not commonly used. .exclude("SPARK-30668: use legacy timestamp parser in to_timestamp") - // Replaced by another test. - .exclude("to_utc_timestamp with literal zone") - // Replaced by another test. - .exclude("to_utc_timestamp with column zone") - // Replaced by another test - .exclude("from_utc_timestamp with literal zone") - // Replaced by another test - .exclude("from_utc_timestamp with column zone") enableSuite[GlutenDeprecatedAPISuite] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOff] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOn] @@ -1101,8 +1103,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-31116: Select nested schema with case insensitive mode") // exclude as original metric not correct when task offloaded to velox .exclude("SPARK-37585: test input metrics for DSV2 with output limits") - // TODO(yuan): fix the input bytes on ORC code path - .exclude("SPARK-30362: test input metrics for DSV2") // DISABLED: GLUTEN-4893 Vanilla UT checks scan operator by exactly matching the class type .exclude("File source v2: support passing data filters to FileScan without partitionFilters") // DISABLED: GLUTEN-4893 Vanilla UT checks scan operator by exactly matching the class type @@ -1118,9 +1118,6 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenJoinSuite] // exclude as it check spark plan .exclude("SPARK-36794: Ignore duplicated key when building relation for semi/anti hash join") - // exclude as it check for SMJ node - .exclude( - "SPARK-43113: Full outer join with duplicate stream-side references in condition (SMJ)") enableSuite[GlutenMathFunctionsSuite] enableSuite[GlutenMetadataCacheSuite] .exclude("SPARK-16336,SPARK-27961 Suggest fixing FileNotFoundException") @@ -1189,10 +1186,6 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenResolveDefaultColumnsSuite] enableSuite[GlutenSubqueryHintPropagationSuite] enableSuite[GlutenUrlFunctionsSuite] - enableSuite[GlutenFileMetadataStructRowIndexSuite] - // Row index metadata column support in Velox isn't ready yet, refer velox-9147 - .exclude("reading _tmp_metadata_row_index - not present in a table") - .exclude("reading _tmp_metadata_row_index - present in a table") enableSuite[GlutenParquetRowIndexSuite] .excludeByPrefix("row index generation") .excludeByPrefix("invalid row index column type") diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/GlutenSortShuffleSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/GlutenSortShuffleSuite.scala new file mode 100644 index 0000000000000..70579c886248e --- /dev/null +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/GlutenSortShuffleSuite.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark + +import org.apache.spark.sql.GlutenTestsBaseTrait + +class GlutenSortShuffleSuite extends SortShuffleSuite with GlutenTestsBaseTrait { + override def beforeAll(): Unit = { + super.beforeAll() + conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager") + } +} diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala index edd2a5a9672df..a4b530e637af9 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala @@ -16,4 +16,40 @@ */ package org.apache.spark.sql -class GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTestsTrait {} +import org.apache.spark.sql.execution.ProjectExec +import org.apache.spark.sql.functions.{expr, input_file_name} +import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType} + +class GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTestsTrait { + testGluten("input_file_name with scan is fallback") { + withTempPath { + dir => + val rawData = Seq( + Row(1, "Alice", Seq(Row(Seq(1, 2, 3)))), + Row(2, "Bob", Seq(Row(Seq(4, 5)))), + Row(3, "Charlie", Seq(Row(Seq(6, 7, 8, 9)))) + ) + val schema = StructType( + Array( + StructField("id", IntegerType, nullable = false), + StructField("name", StringType, nullable = false), + StructField( + "nested_column", + ArrayType( + StructType(Array( + StructField("array_in_struct", ArrayType(IntegerType), nullable = true) + ))), + nullable = true) + )) + val data: DataFrame = spark.createDataFrame(sparkContext.parallelize(rawData), schema) + data.write.parquet(dir.getCanonicalPath) + + val q = + spark.read.parquet(dir.getCanonicalPath).select(input_file_name(), expr("nested_column")) + val firstRow = q.head() + assert(firstRow.getString(0).contains(dir.toURI.getPath)) + val project = q.queryExecution.executedPlan.collect { case p: ProjectExec => p } + assert(project.size == 1) + } + } +} diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala index fbbfdf45daa86..a946e6de43452 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala @@ -246,80 +246,4 @@ class GlutenDateFunctionsSuite extends DateFunctionsSuite with GlutenSQLTestsTra } } } - - testGluten("to_utc_timestamp with literal zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") - ).toDF("a", "b") - checkAnswer( - df.select(to_utc_timestamp(col("a"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-25 07:00:00"))) - ) - checkAnswer( - df.select(to_utc_timestamp(col("b"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-25 07:00:00"))) - ) - } - - testGluten("to_utc_timestamp with column zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", 
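The GlutenSortShuffleSuite added above reruns Spark's sort-shuffle tests with `spark.shuffle.manager` pointed at Gluten's `ColumnarShuffleManager`. For readers wiring this up outside the test harness, a minimal sketch follows; only the shuffle-manager class name is taken from the patch, while the plugin class and off-heap settings are assumptions about a typical Gluten deployment and may differ between releases.

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch: route shuffles through the columnar shuffle manager exercised by
// GlutenSortShuffleSuite. Only spark.shuffle.manager is taken from the patch; the
// plugin and off-heap settings below are assumptions about a typical Gluten setup.
val spark = SparkSession.builder()
  .appName("columnar-shuffle-sketch")
  .config("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
  .config("spark.plugins", "org.apache.gluten.GlutenPlugin") // assumed plugin class name
  .config("spark.memory.offHeap.enabled", "true")            // assumed requirement
  .config("spark.memory.offHeap.size", "2g")
  .getOrCreate()

// Any shuffle-producing query now goes through the configured manager.
spark.range(0, 1000)
  .selectExpr("id % 10 AS k", "id AS v")
  .groupBy("k")
  .count()
  .show()
```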
"America/Los_Angeles"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "Europe/Paris") - ).toDF("a", "b", "c") - checkAnswer( - df.select(to_utc_timestamp(col("a"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-24 22:00:00"))) - ) - checkAnswer( - df.select(to_utc_timestamp(col("b"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-24 22:00:00"))) - ) - } - - testGluten("from_utc_timestamp with literal zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") - ).toDF("a", "b") - checkAnswer( - df.select(from_utc_timestamp(col("a"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-23 17:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - checkAnswer( - df.select(from_utc_timestamp(col("b"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-23 17:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - } - - testGluten("from_utc_timestamp with column zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "Europe/Paris"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "America/Los_Angeles") - ).toDF("a", "b", "c") - checkAnswer( - df.select(from_utc_timestamp(col("a"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 02:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - checkAnswer( - df.select(from_utc_timestamp(col("b"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 02:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - } } diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenJoinSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenJoinSuite.scala index 4ac8bd3ea8bf0..8a5a5923f7299 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenJoinSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenJoinSuite.scala @@ -16,9 +16,6 @@ */ package org.apache.spark.sql -import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec - class GlutenJoinSuite extends JoinSuite with GlutenSQLTestsTrait { override def testNameBlackList: Seq[String] = Seq( @@ -55,14 +52,4 @@ class GlutenJoinSuite extends JoinSuite with GlutenSQLTestsTrait { |""".stripMargin checkAnswer(spark.sql(sql), Seq(Row(0, 1), Row(1, 2), Row(2, 3))) } - - testGluten( - "SPARK-43113: Full outer join with duplicate stream-side" + - " references in condition (SHJ)") { - def check(plan: SparkPlan): Unit = { - assert(collect(plan) { case _: ShuffledHashJoinExec => true }.size === 1) - } - - dupStreamSideColTest("MERGE", check) - } } diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala index c58284e4403bb..3d82e214f0311 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala @@ -30,10 +30,6 @@ class GlutenStringFunctionsSuite import testImplicits._ - override def testNameBlackList: Seq[String] = super.testNameBlackList ++ Seq( - "string / binary length function" - ) - testGluten("string split function with no limit and regex pattern") { val df1 = 
Seq(("aaAbbAcc4")).toDF("a").select(split($"a", "A")) checkAnswer(df1, Row(Seq("aa", "bb", "cc4"))) diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala index e726dcea18c7f..f4a9dfd56a927 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala @@ -380,6 +380,9 @@ class GlutenDateExpressionsSuite extends DateExpressionsSuite with GlutenTestsTr checkEvaluation( FromUnixTime(Literal(-1000L), Literal(fmt2), timeZoneId), sdf2.format(new Timestamp(-1000000))) + checkEvaluation( + FromUnixTime(Literal(Long.MaxValue), Literal(fmt2), timeZoneId), + sdf2.format(new Timestamp(-1000))) checkEvaluation( FromUnixTime( Literal.create(null, LongType), diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenTryEvalSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenTryEvalSuite.scala new file mode 100644 index 0000000000000..6af97677e5d8e --- /dev/null +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenTryEvalSuite.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.sql.GlutenTestsTrait + +class GlutenTryEvalSuite extends TryEvalSuite with GlutenTestsTrait {} diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/errors/GlutenQueryExecutionErrorsSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/errors/GlutenQueryExecutionErrorsSuite.scala index d9b661c8a4a79..8896541c29d28 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/errors/GlutenQueryExecutionErrorsSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/errors/GlutenQueryExecutionErrorsSuite.scala @@ -24,15 +24,4 @@ class GlutenQueryExecutionErrorsSuite override protected def getResourceParquetFilePath(name: String): String = { getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString + "/" + name } - - testGluten( - "SCALAR_SUBQUERY_TOO_MANY_ROWS: " + - "More than one row returned by a subquery used as an expression") { - val exception = intercept[IllegalStateException] { - sql("select (select a from (select 1 as a union all select 2 as a) t) as b").collect() - } - assert( - exception.getMessage.contains("more than one row returned by a subquery" + - " used as an expression")) - } } diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala index 7976288dd4efc..8ce0af8df051e 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.execution.BasicScanExecTransformer import org.apache.gluten.extension.GlutenPlan -import org.apache.gluten.extension.columnar.{FallbackEmptySchemaRelation, TRANSFORM_UNSUPPORTED, TransformHints} +import org.apache.gluten.extension.columnar.{FallbackEmptySchemaRelation, FallbackTags, TRANSFORM_UNSUPPORTED} import org.apache.gluten.extension.columnar.heuristic.HeuristicApplier import org.apache.gluten.extension.columnar.transition.InsertTransitions import org.apache.gluten.utils.QueryPlanSelector @@ -125,10 +125,10 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { testGluten("Tag not transformable more than once") { val originalPlan = UnaryOp1(LeafOp(supportsColumnar = true)) - TransformHints.tag(originalPlan, TRANSFORM_UNSUPPORTED(Some("fake reason"))) + FallbackTags.tag(originalPlan, TRANSFORM_UNSUPPORTED(Some("fake reason"))) val rule = FallbackEmptySchemaRelation() val newPlan = rule.apply(originalPlan) - val reason = TransformHints.getHint(newPlan).asInstanceOf[TRANSFORM_UNSUPPORTED].reason + val reason = FallbackTags.getTag(newPlan).asInstanceOf[TRANSFORM_UNSUPPORTED].reason assert(reason.isDefined) if (BackendsApiManager.getSettings.fallbackOnEmptySchema(newPlan)) { assert( diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/GlutenReplaceHashWithSortAggSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/GlutenReplaceHashWithSortAggSuite.scala index 8d795bbffea0c..332c21418a9b9 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/GlutenReplaceHashWithSortAggSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/GlutenReplaceHashWithSortAggSuite.scala @@ -84,11 +84,11 @@ 
class GlutenReplaceHashWithSortAggSuite withTempView("t1", "t2") { spark.range(100).selectExpr("id as key").createOrReplaceTempView("t1") spark.range(50).selectExpr("id as key").createOrReplaceTempView("t2") - Seq("COUNT", "COLLECT_LIST").foreach { - aggExpr => + Seq(("COUNT", 0, 1, 2, 0), ("COLLECT_LIST", 2, 0, 2, 0)).foreach { + aggExprInfo => val query = s""" - |SELECT key, $aggExpr(key) + |SELECT key, ${aggExprInfo._1}(key) |FROM |( | SELECT /*+ SHUFFLE_MERGE(t1) */ t1.key AS key @@ -98,7 +98,7 @@ class GlutenReplaceHashWithSortAggSuite |) |GROUP BY key """.stripMargin - checkAggs(query, 2, 0, 2, 0) + checkAggs(query, aggExprInfo._2, aggExprInfo._3, aggExprInfo._4, aggExprInfo._5) } } } diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala index 5e4d1ed8f81f4..6e47a94e3c13d 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala @@ -16,12 +16,11 @@ */ package org.apache.spark.sql.execution.datasources +import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.execution.{FileSourceScanExecTransformer, FilterExecTransformer} -import org.apache.gluten.utils.BackendTestUtils import org.apache.spark.sql.{Column, DataFrame, Row} import org.apache.spark.sql.GlutenSQLTestsBaseTrait -import org.apache.spark.sql.GlutenTestConstants.GLUTEN_TEST import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType} @@ -46,6 +45,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS private val METADATA_FILE_NAME = "_metadata.file_name" private val METADATA_FILE_SIZE = "_metadata.file_size" private val METADATA_FILE_MODIFICATION_TIME = "_metadata.file_modification_time" + private val FILE_FORMAT = "fileFormat" private def getMetadataForFile(f: File): Map[String, Any] = { Map( @@ -60,7 +60,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS f: (DataFrame, Map[String, Any], Map[String, Any]) => Unit): Unit = { Seq("parquet").foreach { testFileFormat => - test(s"$GLUTEN_TEST metadata struct ($testFileFormat): " + testName) { + testGluten(s"metadata struct ($testFileFormat): " + testName) { withTempDir { dir => import scala.collection.JavaConverters._ @@ -109,7 +109,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS METADATA_FILE_MODIFICATION_TIME, "age") dfWithMetadata.collect - if (BackendTestUtils.isVeloxBackendLoaded()) { + if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) { checkOperatorMatch[FileSourceScanExecTransformer](dfWithMetadata) } else { checkOperatorMatch[FileSourceScanExec](dfWithMetadata) @@ -134,7 +134,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS .where(Column(METADATA_FILE_NAME) === f0((METADATA_FILE_NAME))) val ret = filterDF.collect assert(ret.size == 1) - if (BackendTestUtils.isVeloxBackendLoaded()) { + if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) { checkOperatorMatch[FileSourceScanExecTransformer](filterDF) } else { checkOperatorMatch[FileSourceScanExec](filterDF) @@ -150,7 +150,7 @@ class GlutenFileMetadataStructSuite extends 
FileMetadataStructSuite with GlutenS Row(f1(METADATA_FILE_PATH)) ) ) - if (BackendTestUtils.isVeloxBackendLoaded()) { + if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) { checkOperatorMatch[FileSourceScanExecTransformer](filterDF) } else { checkOperatorMatch[FileSourceScanExec](filterDF) diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetRowIndexSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetRowIndexSuite.scala index 6f153450cb961..0113a92282fc5 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetRowIndexSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetRowIndexSuite.scala @@ -39,10 +39,6 @@ import scala.collection.JavaConverters._ class GlutenParquetRowIndexSuite extends ParquetRowIndexSuite with GlutenSQLTestsBaseTrait { import testImplicits._ - override def beforeAll(): Unit = { - super.beforeAll() - sparkContext.setLogLevel("info") - } private def readRowGroupRowCounts(path: String): Seq[Long] = { ParquetFooterReader diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala index 00b4bf5821f1e..4df9c63b3ef6d 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.sql.extension -import org.apache.gluten.extension.{ColumnarOverrideRules, JoinSelectionOverrides} +import org.apache.gluten.extension.ColumnarOverrideRules import org.apache.spark.SparkConf import org.apache.spark.sql._ @@ -31,7 +31,6 @@ class GlutenSessionExtensionSuite extends GlutenSQLTestsTrait { testGluten("test gluten extensions") { assert(spark.sessionState.columnarRules.contains(ColumnarOverrideRules(spark))) - assert(spark.sessionState.planner.strategies.contains(JoinSelectionOverrides(spark))) assert(spark.sessionState.planner.strategies.contains(MySparkStrategy(spark))) assert(spark.sessionState.analyzer.extendedResolutionRules.contains(MyRule(spark))) diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala index 592019f34ccf0..85af9a6238531 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala @@ -50,6 +50,7 @@ case class TestFileSourceScanExecTransformer( override def getPartitions: Seq[InputPartition] = BackendsApiManager.getTransformerApiInstance.genInputPartitionSeq( relation, + requiredSchema, selectedPartitions, output, bucketedScan, diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/gluten/GlutenFallbackSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/gluten/GlutenFallbackSuite.scala index 9e8c7e54291a6..fd6aa047558f1 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/gluten/GlutenFallbackSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/gluten/GlutenFallbackSuite.scala @@ -25,6 +25,7 @@ 
import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent} import org.apache.spark.sql.{GlutenSQLTestsTrait, Row} import org.apache.spark.sql.execution.ProjectExec import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.ui.{GlutenSQLAppStatusStore, SparkListenerSQLExecutionStart} import org.apache.spark.status.ElementTrackingStore @@ -168,18 +169,52 @@ class GlutenFallbackSuite extends GlutenSQLTestsTrait with AdaptiveSparkPlanHelp withSQLConf(GlutenConfig.EXPRESSION_BLACK_LIST.key -> "add") { try { val df = spark.sql("select sum(id + 1) from range(10)") - spark.sparkContext.listenerBus.waitUntilEmpty() df.collect() + spark.sparkContext.listenerBus.waitUntilEmpty() val project = find(df.queryExecution.executedPlan) { _.isInstanceOf[ProjectExec] } assert(project.isDefined) - events.exists( - _.fallbackNodeToReason.values.toSet - .contains("Project: Not supported to map spark function name")) + assert( + events.exists(_.fallbackNodeToReason.values.toSet + .exists(_.contains("Not supported to map spark function name")))) } finally { spark.sparkContext.removeSparkListener(listener) } } } + + test("ExpandFallbackPolicy should propagate fallback reason to vanilla SparkPlan") { + val events = new ArrayBuffer[GlutenPlanFallbackEvent] + val listener = new SparkListener { + override def onOtherEvent(event: SparkListenerEvent): Unit = { + event match { + case e: GlutenPlanFallbackEvent => events.append(e) + case _ => + } + } + } + spark.sparkContext.addSparkListener(listener) + spark.range(10).selectExpr("id as c1", "id as c2").write.format("parquet").saveAsTable("t") + withTable("t") { + withSQLConf( + GlutenConfig.EXPRESSION_BLACK_LIST.key -> "max", + GlutenConfig.COLUMNAR_WHOLESTAGE_FALLBACK_THRESHOLD.key -> "1") { + try { + val df = spark.sql("select c2, max(c1) as id from t group by c2") + df.collect() + spark.sparkContext.listenerBus.waitUntilEmpty() + val agg = collect(df.queryExecution.executedPlan) { case a: HashAggregateExec => a } + assert(agg.size == 2) + assert( + events.count( + _.fallbackNodeToReason.values.toSet.exists(_.contains( + "Could not find a valid substrait mapping name for max" + ))) == 2) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + } + } + } } diff --git a/gluten-ut/spark35/pom.xml b/gluten-ut/spark35/pom.xml index 8f0a5605bdff9..2bf1c93a00522 100644 --- a/gluten-ut/spark35/pom.xml +++ b/gluten-ut/spark35/pom.xml @@ -63,6 +63,28 @@ test + + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-sources + generate-sources + + add-test-source + + + + src/test/backends-clickhouse + + + + + + + backends-velox @@ -95,7 +117,7 @@ org.apache.arrow arrow-c-data - ${arrow.version} + ${arrow-gluten.version} test @@ -155,6 +177,28 @@ 2.19.0 + + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-sources + generate-sources + + add-test-source + + + + src/test/backends-velox + + + + + + + diff --git a/gluten-ut/spark35/src/test/backends-clickhouse/org/apache/gluten/GlutenColumnarWriteTestSupport.scala b/gluten-ut/spark35/src/test/backends-clickhouse/org/apache/gluten/GlutenColumnarWriteTestSupport.scala new file mode 100644 index 0000000000000..43b83afe9af37 --- /dev/null +++ b/gluten-ut/spark35/src/test/backends-clickhouse/org/apache/gluten/GlutenColumnarWriteTestSupport.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
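The new fallback assertions above follow a common pattern: register a `SparkListener`, run a query with an expression black-listed, and inspect the reported per-node fallback reasons. A hedged sketch of the same observation pattern from application code, assuming an existing SparkSession `spark`; the event type is matched by class name only so the snippet stays free of backend-internal imports, and the black-list key is referenced through `GlutenConfig.EXPRESSION_BLACK_LIST.key` as in the suite rather than spelled out.

```scala
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent}

// Collect Gluten fallback events from the listener bus. The suite imports the concrete
// GlutenPlanFallbackEvent type and reads e.fallbackNodeToReason; here the event is only
// matched by its simple class name to keep the sketch self-contained.
val fallbackEvents = ArrayBuffer.empty[SparkListenerEvent]

val listener = new SparkListener {
  override def onOtherEvent(event: SparkListenerEvent): Unit = event match {
    case e if e.getClass.getSimpleName == "GlutenPlanFallbackEvent" => fallbackEvents += e
    case _ =>
  }
}

spark.sparkContext.addSparkListener(listener)
try {
  // As in the suite, black-listing an expression first (e.g. setting
  // GlutenConfig.EXPRESSION_BLACK_LIST.key to "add") makes the Project fall back
  // and produces a fallback event for the query below.
  spark.sql("SELECT sum(id + 1) FROM range(10)").collect()
  // Events arrive asynchronously; the suite flushes the bus with the private[spark]
  // listenerBus.waitUntilEmpty(), which is not available to application code.
  Thread.sleep(1000)
  println(s"observed ${fallbackEvents.size} fallback event(s)")
} finally {
  spark.sparkContext.removeSparkListener(listener)
}
```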
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten + +import org.apache.spark.sql.execution.SparkPlan + +trait GlutenColumnarWriteTestSupport { + + def checkWriteFilesAndGetChild(sparkPlan: SparkPlan): SparkPlan = { + throw new UnsupportedOperationException("Clickhouse Backend does not support write files") + } +} diff --git a/gluten-ut/spark35/src/test/backends-velox/org/apache/gluten/GlutenColumnarWriteTestSupport.scala b/gluten-ut/spark35/src/test/backends-velox/org/apache/gluten/GlutenColumnarWriteTestSupport.scala new file mode 100644 index 0000000000000..c7ad606bcf8d0 --- /dev/null +++ b/gluten-ut/spark35/src/test/backends-velox/org/apache/gluten/GlutenColumnarWriteTestSupport.scala @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten + +import org.apache.spark.sql.execution.{SparkPlan, VeloxColumnarWriteFilesExec} + +trait GlutenColumnarWriteTestSupport { + + def checkWriteFilesAndGetChild(sparkPlan: SparkPlan): SparkPlan = { + assert(sparkPlan.isInstanceOf[VeloxColumnarWriteFilesExec]) + sparkPlan.asInstanceOf[VeloxColumnarWriteFilesExec].child + } +} diff --git a/gluten-ut/spark35/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql b/gluten-ut/spark35/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql deleted file mode 100644 index 741292d2c0fa5..0000000000000 --- a/gluten-ut/spark35/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql +++ /dev/null @@ -1,257 +0,0 @@ --- A test suite for scalar subquery in SELECT clause - -create temporary view t1 as select * from values - ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 00:00:00.000', date '2014-04-04'), - ('val1b', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1a', 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ('val1a', 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ('val1c', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ('val1d', null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ('val1d', null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ('val1e', 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ('val1d', 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i); - -create temporary view t2 as select * from values - ('val2a', 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1b', 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ('val1c', 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ('val1b', null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ('val2e', 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ('val1f', 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ('val1c', 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ('val1e', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ('val1f', 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ('val1b', null, 16, 19L, float(17), 
25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i); - -create temporary view t3 as select * from values - ('val3a', 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ('val3a', 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ('val3c', 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ('val3c', 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ('val1b', null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ('val3b', 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val3b', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i); - --- Group 1: scalar subquery in SELECT clause --- no correlation --- TC 01.01 --- more than one scalar subquery -SELECT (SELECT min(t3d) FROM t3) min_t3d, - (SELECT max(t2h) FROM t2) max_t2h -FROM t1 -WHERE t1a = 'val1c'; - --- TC 01.02 --- scalar subquery in an IN subquery -SELECT t1a, count(*) -FROM t1 -WHERE t1c IN (SELECT (SELECT min(t3c) FROM t3) - FROM t2 - GROUP BY t2g - HAVING count(*) > 1) -GROUP BY t1a; - --- TC 01.03 --- under a set op -SELECT (SELECT min(t3d) FROM t3) min_t3d, - null -FROM t1 -WHERE t1a = 'val1c' -UNION -SELECT null, - (SELECT max(t2h) FROM t2) max_t2h -FROM t1 -WHERE t1a = 'val1c'; - --- TC 01.04 -SELECT (SELECT min(t3c) FROM t3) min_t3d -FROM t1 -WHERE t1a = 'val1a' -INTERSECT -SELECT (SELECT min(t2c) FROM t2) min_t2d -FROM t1 -WHERE t1a = 'val1d'; - --- TC 01.05 -SELECT q1.t1a, q2.t2a, q1.min_t3d, q2.avg_t3d -FROM (SELECT t1a, (SELECT min(t3d) FROM t3) min_t3d - FROM t1 - WHERE t1a IN ('val1e', 'val1c')) q1 - FULL OUTER JOIN - (SELECT t2a, (SELECT avg(t3d) FROM t3) avg_t3d - FROM t2 - WHERE t2a IN ('val1c', 'val2a')) q2 -ON q1.t1a = q2.t2a -AND q1.min_t3d < q2.avg_t3d; - --- Group 2: scalar subquery in SELECT clause --- with correlation --- TC 02.01 -SELECT (SELECT min(t3d) FROM t3 WHERE t3.t3a = t1.t1a) min_t3d, - (SELECT max(t2h) FROM t2 WHERE t2.t2a = t1.t1a) max_t2h -FROM t1 -WHERE t1a = 'val1b'; - --- TC 02.02 -SELECT (SELECT min(t3d) FROM t3 WHERE t3a = t1a) min_t3d -FROM t1 -WHERE t1a = 'val1b' -MINUS -SELECT (SELECT min(t3d) FROM t3) abs_min_t3d -FROM t1 -WHERE t1a = 'val1b'; - --- TC 02.03 -SELECT t1a, t1b -FROM t1 -WHERE NOT EXISTS (SELECT (SELECT max(t2b) - FROM t2 LEFT JOIN t1 - ON t2a = t1a - WHERE t2c = t3c) dummy - FROM t3 - WHERE t3b < (SELECT max(t2b) - FROM t2 LEFT JOIN t1 - ON t2a = t1a - WHERE t2c = t3c) - AND t3a = t1a); - --- SPARK-34876: Non-nullable aggregates should not return NULL in a correlated subquery -SELECT t1a, - (SELECT count(t2d) FROM t2 WHERE t2a = t1a) count_t2, - (SELECT count_if(t2d > 0) FROM t2 WHERE t2a = t1a) count_if_t2, - (SELECT approx_count_distinct(t2d) FROM t2 WHERE t2a = t1a) 
approx_count_distinct_t2, - (SELECT collect_list(t2d) FROM t2 WHERE t2a = t1a) collect_list_t2, - (SELECT sort_array(collect_set(t2d)) FROM t2 WHERE t2a = t1a) collect_set_t2, - (SELECT hex(count_min_sketch(t2d, 0.5d, 0.5d, 1)) FROM t2 WHERE t2a = t1a) collect_set_t2 -FROM t1; - --- SPARK-36028: Allow Project to host outer references in scalar subqueries -SELECT t1c, (SELECT t1c) FROM t1; -SELECT t1c, (SELECT t1c WHERE t1c = 8) FROM t1; -SELECT t1c, t1d, (SELECT c + d FROM (SELECT t1c AS c, t1d AS d)) FROM t1; -SELECT t1c, (SELECT SUM(c) FROM (SELECT t1c AS c)) FROM t1; -SELECT t1a, (SELECT SUM(t2b) FROM t2 JOIN (SELECT t1a AS a) ON t2a = a) FROM t1; - --- CTE in correlated scalar subqueries -CREATE OR REPLACE TEMPORARY VIEW t1 AS VALUES (0, 1), (1, 2) t1(c1, c2); -CREATE OR REPLACE TEMPORARY VIEW t2 AS VALUES (0, 2), (0, 3) t2(c1, c2); - --- Single row subquery -SELECT c1, (WITH t AS (SELECT 1 AS a) SELECT a + c1 FROM t) FROM t1; --- Correlation in CTE. -SELECT c1, (WITH t AS (SELECT * FROM t2 WHERE c1 = t1.c1) SELECT SUM(c2) FROM t) FROM t1; --- Multiple CTE definitions. -SELECT c1, ( - WITH t3 AS (SELECT c1 + 1 AS c1, c2 + 1 AS c2 FROM t2), - t4 AS (SELECT * FROM t3 WHERE t1.c1 = c1) - SELECT SUM(c2) FROM t4 -) FROM t1; --- Multiple CTE references. -SELECT c1, ( - WITH t AS (SELECT * FROM t2) - SELECT SUM(c2) FROM (SELECT c1, c2 FROM t UNION SELECT c2, c1 FROM t) r(c1, c2) - WHERE c1 = t1.c1 -) FROM t1; --- Reference CTE in both the main query and the subquery. -WITH v AS (SELECT * FROM t2) -SELECT * FROM t1 WHERE c1 > ( - WITH t AS (SELECT * FROM t2) - SELECT COUNT(*) FROM v WHERE c1 = t1.c1 AND c1 > (SELECT SUM(c2) FROM t WHERE c1 = v.c1) -); --- Single row subquery that references CTE in the main query. -WITH t AS (SELECT 1 AS a) -SELECT c1, (SELECT a FROM t WHERE a = c1) FROM t1; --- Multiple CTE references with non-deterministic CTEs. -WITH -v1 AS (SELECT c1, c2, rand(0) c3 FROM t1), -v2 AS (SELECT c1, c2, rand(0) c4 FROM v1 WHERE c3 IN (SELECT c3 FROM v1)) -SELECT c1, ( - WITH v3 AS (SELECT c1, c2, rand(0) c5 FROM t2) - SELECT COUNT(*) FROM ( - SELECT * FROM v2 WHERE c1 > 0 - UNION SELECT * FROM v2 WHERE c2 > 0 - UNION SELECT * FROM v3 WHERE c2 > 0 - ) WHERE c1 = v1.c1 -) FROM v1; - --- Multi-value subquery error -SELECT (SELECT a FROM (SELECT 1 AS a UNION ALL SELECT 2 AS a) t) AS b; - --- SPARK-36114: Support correlated non-equality predicates -CREATE OR REPLACE TEMP VIEW t1(c1, c2) AS (VALUES (0, 1), (1, 2)); -CREATE OR REPLACE TEMP VIEW t2(c1, c2) AS (VALUES (0, 2), (0, 3)); - --- Neumann example Q2 -CREATE OR REPLACE TEMP VIEW students(id, name, major, year) AS (VALUES - (0, 'A', 'CS', 2022), - (1, 'B', 'CS', 2022), - (2, 'C', 'Math', 2022)); -CREATE OR REPLACE TEMP VIEW exams(sid, course, curriculum, grade, date) AS (VALUES - (0, 'C1', 'CS', 4, 2020), - (0, 'C2', 'CS', 3, 2021), - (1, 'C1', 'CS', 2, 2020), - (1, 'C2', 'CS', 1, 2021)); - -SELECT students.name, exams.course -FROM students, exams -WHERE students.id = exams.sid - AND (students.major = 'CS' OR students.major = 'Games Eng') - AND exams.grade >= ( - SELECT avg(exams.grade) + 1 - FROM exams - WHERE students.id = exams.sid - OR (exams.curriculum = students.major AND students.year > exams.date)); - --- Correlated non-equality predicates -SELECT (SELECT min(c2) FROM t2 WHERE t1.c1 > t2.c1) FROM t1; -SELECT (SELECT min(c2) FROM t2 WHERE t1.c1 >= t2.c1 AND t1.c2 < t2.c2) FROM t1; - --- Correlated non-equality predicates with the COUNT bug. 
-SELECT (SELECT count(*) FROM t2 WHERE t1.c1 > t2.c1) FROM t1; - --- Correlated equality predicates that are not supported after SPARK-35080 -SELECT c, ( - SELECT count(*) - FROM (VALUES ('ab'), ('abc'), ('bc')) t2(c) - WHERE t1.c = substring(t2.c, 1, 1) -) FROM (VALUES ('a'), ('b')) t1(c); - -SELECT c, ( - SELECT count(*) - FROM (VALUES (0, 6), (1, 5), (2, 4), (3, 3)) t1(a, b) - WHERE a + b = c -) FROM (VALUES (6)) t2(c); - --- SPARK-43156: scalar subquery with Literal result like `COUNT(1) is null` -SELECT *, (SELECT count(1) is null FROM t2 WHERE t1.c1 = t2.c1) FROM t1; - -select (select f from (select false as f, max(c2) from t1 where t1.c1 = t1.c1)) from t2; - --- SPARK-43596: handle IsNull when rewriting the domain join -set spark.sql.optimizer.optimizeOneRowRelationSubquery.alwaysInline=false; -WITH T AS (SELECT 1 AS a) -SELECT (SELECT sum(1) FROM T WHERE a = col OR upper(col)= 'Y') -FROM (SELECT null as col) as foo; -set spark.sql.optimizer.optimizeOneRowRelationSubquery.alwaysInline=true; - --- SPARK-43760: the result of the subquery can be NULL. -select * from ( - select t1.id c1, ( - select t2.id c from range (1, 2) t2 - where t1.id = t2.id ) c2 - from range (1, 3) t1 ) t -where t.c2 is not null; diff --git a/gluten-ut/spark35/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out b/gluten-ut/spark35/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out deleted file mode 100644 index 5c6f141d8505b..0000000000000 --- a/gluten-ut/spark35/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out +++ /dev/null @@ -1,614 +0,0 @@ --- Automatically generated by GlutenSQLQueryTestSuite --- Number of queries: 52 - - --- !query -create temporary view t1 as select * from values - ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 00:00:00.000', date '2014-04-04'), - ('val1b', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1a', 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ('val1a', 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ('val1c', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ('val1d', null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ('val1d', null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ('val1e', 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ('val1d', 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query schema -struct<> --- !query output - - - --- !query -create temporary view t2 as select * from values - ('val2a', 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1b', 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date 
'2015-05-04'), - ('val1c', 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ('val1b', null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ('val2e', 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ('val1f', 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ('val1c', 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ('val1e', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ('val1f', 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query schema -struct<> --- !query output - - - --- !query -create temporary view t3 as select * from values - ('val3a', 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ('val3a', 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ('val3c', 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ('val3c', 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ('val1b', null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ('val3b', 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val3b', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query schema -struct<> --- !query output - - - --- !query -SELECT (SELECT min(t3d) FROM t3) min_t3d, - (SELECT max(t2h) FROM t2) max_t2h -FROM t1 -WHERE t1a = 'val1c' --- !query schema -struct --- !query output -10 2017-05-04 01:01:00 - - --- !query -SELECT t1a, count(*) -FROM t1 -WHERE t1c IN (SELECT (SELECT min(t3c) FROM t3) - FROM t2 - GROUP BY t2g - HAVING count(*) > 1) -GROUP BY t1a --- !query schema -struct --- !query output -val1a 2 - - --- !query -SELECT (SELECT min(t3d) FROM t3) min_t3d, - null -FROM t1 -WHERE t1a = 'val1c' -UNION -SELECT null, - (SELECT max(t2h) FROM t2) max_t2h -FROM t1 -WHERE t1a = 'val1c' --- !query schema -struct --- !query output -10 NULL -NULL 2017-05-04 01:01:00 - - --- !query -SELECT (SELECT min(t3c) FROM t3) min_t3d -FROM t1 -WHERE t1a = 'val1a' -INTERSECT -SELECT (SELECT min(t2c) FROM t2) min_t2d -FROM t1 -WHERE t1a = 'val1d' --- !query schema -struct --- !query output -12 - - --- !query -SELECT q1.t1a, q2.t2a, q1.min_t3d, q2.avg_t3d -FROM (SELECT t1a, (SELECT min(t3d) FROM t3) min_t3d - 
FROM t1 - WHERE t1a IN ('val1e', 'val1c')) q1 - FULL OUTER JOIN - (SELECT t2a, (SELECT avg(t3d) FROM t3) avg_t3d - FROM t2 - WHERE t2a IN ('val1c', 'val2a')) q2 -ON q1.t1a = q2.t2a -AND q1.min_t3d < q2.avg_t3d --- !query schema -struct --- !query output -NULL val2a NULL 200.83333333333334 -val1c val1c 10 200.83333333333334 -val1c val1c 10 200.83333333333334 -val1e NULL 10 NULL -val1e NULL 10 NULL -val1e NULL 10 NULL - - --- !query -SELECT (SELECT min(t3d) FROM t3 WHERE t3.t3a = t1.t1a) min_t3d, - (SELECT max(t2h) FROM t2 WHERE t2.t2a = t1.t1a) max_t2h -FROM t1 -WHERE t1a = 'val1b' --- !query schema -struct --- !query output -19 2017-05-04 01:01:00 - - --- !query -SELECT (SELECT min(t3d) FROM t3 WHERE t3a = t1a) min_t3d -FROM t1 -WHERE t1a = 'val1b' -MINUS -SELECT (SELECT min(t3d) FROM t3) abs_min_t3d -FROM t1 -WHERE t1a = 'val1b' --- !query schema -struct --- !query output -19 - - --- !query -SELECT t1a, t1b -FROM t1 -WHERE NOT EXISTS (SELECT (SELECT max(t2b) - FROM t2 LEFT JOIN t1 - ON t2a = t1a - WHERE t2c = t3c) dummy - FROM t3 - WHERE t3b < (SELECT max(t2b) - FROM t2 LEFT JOIN t1 - ON t2a = t1a - WHERE t2c = t3c) - AND t3a = t1a) --- !query schema -struct --- !query output -val1a 16 -val1a 16 -val1a 6 -val1a 6 -val1c 8 -val1d 10 -val1d NULL -val1d NULL -val1e 10 -val1e 10 -val1e 10 - - --- !query -SELECT t1a, - (SELECT count(t2d) FROM t2 WHERE t2a = t1a) count_t2, - (SELECT count_if(t2d > 0) FROM t2 WHERE t2a = t1a) count_if_t2, - (SELECT approx_count_distinct(t2d) FROM t2 WHERE t2a = t1a) approx_count_distinct_t2, - (SELECT collect_list(t2d) FROM t2 WHERE t2a = t1a) collect_list_t2, - (SELECT sort_array(collect_set(t2d)) FROM t2 WHERE t2a = t1a) collect_set_t2, - (SELECT hex(count_min_sketch(t2d, 0.5d, 0.5d, 1)) FROM t2 WHERE t2a = t1a) collect_set_t2 -FROM t1 --- !query schema -struct,collect_set_t2:array,collect_set_t2:string> --- !query output -val1a 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1a 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1a 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1a 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1b 6 6 3 [19,119,319,19,19,19] [19,119,319] 0000000100000000000000060000000100000004000000005D8D6AB90000000000000000000000000000000400000000000000010000000000000001 -val1c 2 2 2 [219,19] [19,219] 0000000100000000000000020000000100000004000000005D8D6AB90000000000000000000000000000000100000000000000000000000000000001 -val1d 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1d 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1d 0 0 0 [] [] 0000000100000000000000000000000100000004000000005D8D6AB90000000000000000000000000000000000000000000000000000000000000000 -val1e 1 1 1 [19] [19] 0000000100000000000000010000000100000004000000005D8D6AB90000000000000000000000000000000100000000000000000000000000000000 -val1e 1 1 1 [19] [19] 0000000100000000000000010000000100000004000000005D8D6AB90000000000000000000000000000000100000000000000000000000000000000 -val1e 1 1 1 [19] [19] 
0000000100000000000000010000000100000004000000005D8D6AB90000000000000000000000000000000100000000000000000000000000000000 - - --- !query -SELECT t1c, (SELECT t1c) FROM t1 --- !query schema -struct --- !query output -12 12 -12 12 -16 16 -16 16 -16 16 -16 16 -8 8 -8 8 -NULL NULL -NULL NULL -NULL NULL -NULL NULL - - --- !query -SELECT t1c, (SELECT t1c WHERE t1c = 8) FROM t1 --- !query schema -struct --- !query output -12 NULL -12 NULL -16 NULL -16 NULL -16 NULL -16 NULL -8 8 -8 8 -NULL NULL -NULL NULL -NULL NULL -NULL NULL - - --- !query -SELECT t1c, t1d, (SELECT c + d FROM (SELECT t1c AS c, t1d AS d)) FROM t1 --- !query schema -struct --- !query output -12 10 22 -12 21 33 -16 19 35 -16 19 35 -16 19 35 -16 22 38 -8 10 18 -8 10 18 -NULL 12 NULL -NULL 19 NULL -NULL 19 NULL -NULL 25 NULL - - --- !query -SELECT t1c, (SELECT SUM(c) FROM (SELECT t1c AS c)) FROM t1 --- !query schema -struct --- !query output -12 12 -12 12 -16 16 -16 16 -16 16 -16 16 -8 8 -8 8 -NULL NULL -NULL NULL -NULL NULL -NULL NULL - - --- !query -SELECT t1a, (SELECT SUM(t2b) FROM t2 JOIN (SELECT t1a AS a) ON t2a = a) FROM t1 --- !query schema -struct --- !query output -val1a NULL -val1a NULL -val1a NULL -val1a NULL -val1b 36 -val1c 24 -val1d NULL -val1d NULL -val1d NULL -val1e 8 -val1e 8 -val1e 8 - - --- !query -CREATE OR REPLACE TEMPORARY VIEW t1 AS VALUES (0, 1), (1, 2) t1(c1, c2) --- !query schema -struct<> --- !query output - - - --- !query -CREATE OR REPLACE TEMPORARY VIEW t2 AS VALUES (0, 2), (0, 3) t2(c1, c2) --- !query schema -struct<> --- !query output - - - --- !query -SELECT c1, (WITH t AS (SELECT 1 AS a) SELECT a + c1 FROM t) FROM t1 --- !query schema -struct --- !query output -0 1 -1 2 - - --- !query -SELECT c1, (WITH t AS (SELECT * FROM t2 WHERE c1 = t1.c1) SELECT SUM(c2) FROM t) FROM t1 --- !query schema -struct --- !query output -0 5 -1 NULL - - --- !query -SELECT c1, ( - WITH t3 AS (SELECT c1 + 1 AS c1, c2 + 1 AS c2 FROM t2), - t4 AS (SELECT * FROM t3 WHERE t1.c1 = c1) - SELECT SUM(c2) FROM t4 -) FROM t1 --- !query schema -struct --- !query output -0 NULL -1 7 - - --- !query -SELECT c1, ( - WITH t AS (SELECT * FROM t2) - SELECT SUM(c2) FROM (SELECT c1, c2 FROM t UNION SELECT c2, c1 FROM t) r(c1, c2) - WHERE c1 = t1.c1 -) FROM t1 --- !query schema -struct --- !query output -0 5 -1 NULL - - --- !query -WITH v AS (SELECT * FROM t2) -SELECT * FROM t1 WHERE c1 > ( - WITH t AS (SELECT * FROM t2) - SELECT COUNT(*) FROM v WHERE c1 = t1.c1 AND c1 > (SELECT SUM(c2) FROM t WHERE c1 = v.c1) -) --- !query schema -struct --- !query output -1 2 - - --- !query -WITH t AS (SELECT 1 AS a) -SELECT c1, (SELECT a FROM t WHERE a = c1) FROM t1 --- !query schema -struct --- !query output -0 NULL -1 1 - - --- !query -WITH -v1 AS (SELECT c1, c2, rand(0) c3 FROM t1), -v2 AS (SELECT c1, c2, rand(0) c4 FROM v1 WHERE c3 IN (SELECT c3 FROM v1)) -SELECT c1, ( - WITH v3 AS (SELECT c1, c2, rand(0) c5 FROM t2) - SELECT COUNT(*) FROM ( - SELECT * FROM v2 WHERE c1 > 0 - UNION SELECT * FROM v2 WHERE c2 > 0 - UNION SELECT * FROM v3 WHERE c2 > 0 - ) WHERE c1 = v1.c1 -) FROM v1 --- !query schema -struct --- !query output -0 3 -1 1 - - --- !query -SELECT (SELECT a FROM (SELECT 1 AS a UNION ALL SELECT 2 AS a) t) AS b --- !query schema -struct<> --- !query output -java.lang.IllegalStateException -more than one row returned by a subquery used as an expression: -Subquery subquery#1, [id=#2] -+- AdaptiveSparkPlan isFinalPlan=true - +- == Final Plan == - VeloxColumnarToRowExec - +- ColumnarUnion - :- ProjectExecTransformer [1 AS a#3] - : +- 
InputIteratorTransformer[fake_column#4] - : +- InputAdapter - : +- RowToVeloxColumnar - : +- Scan OneRowRelation[fake_column#4] - +- ProjectExecTransformer [2 AS a#5] - +- InputIteratorTransformer[fake_column#6] - +- InputAdapter - +- RowToVeloxColumnar - +- Scan OneRowRelation[fake_column#6] - +- == Initial Plan == - Union - :- Project [1 AS a#3] - : +- Scan OneRowRelation[] - +- Project [2 AS a#5] - +- Scan OneRowRelation[] - - --- !query -CREATE OR REPLACE TEMP VIEW t1(c1, c2) AS (VALUES (0, 1), (1, 2)) --- !query schema -struct<> --- !query output - - - --- !query -CREATE OR REPLACE TEMP VIEW t2(c1, c2) AS (VALUES (0, 2), (0, 3)) --- !query schema -struct<> --- !query output - - - --- !query -CREATE OR REPLACE TEMP VIEW students(id, name, major, year) AS (VALUES - (0, 'A', 'CS', 2022), - (1, 'B', 'CS', 2022), - (2, 'C', 'Math', 2022)) --- !query schema -struct<> --- !query output - - - --- !query -CREATE OR REPLACE TEMP VIEW exams(sid, course, curriculum, grade, date) AS (VALUES - (0, 'C1', 'CS', 4, 2020), - (0, 'C2', 'CS', 3, 2021), - (1, 'C1', 'CS', 2, 2020), - (1, 'C2', 'CS', 1, 2021)) --- !query schema -struct<> --- !query output - - - --- !query -SELECT students.name, exams.course -FROM students, exams -WHERE students.id = exams.sid - AND (students.major = 'CS' OR students.major = 'Games Eng') - AND exams.grade >= ( - SELECT avg(exams.grade) + 1 - FROM exams - WHERE students.id = exams.sid - OR (exams.curriculum = students.major AND students.year > exams.date)) --- !query schema -struct --- !query output -A C1 - - --- !query -SELECT (SELECT min(c2) FROM t2 WHERE t1.c1 > t2.c1) FROM t1 --- !query schema -struct --- !query output -2 -NULL - - --- !query -SELECT (SELECT min(c2) FROM t2 WHERE t1.c1 >= t2.c1 AND t1.c2 < t2.c2) FROM t1 --- !query schema -struct --- !query output -2 -3 - - --- !query -SELECT (SELECT count(*) FROM t2 WHERE t1.c1 > t2.c1) FROM t1 --- !query schema -struct --- !query output -0 -2 - - --- !query -SELECT c, ( - SELECT count(*) - FROM (VALUES ('ab'), ('abc'), ('bc')) t2(c) - WHERE t1.c = substring(t2.c, 1, 1) -) FROM (VALUES ('a'), ('b')) t1(c) --- !query schema -struct --- !query output -a 2 -b 1 - - --- !query -SELECT c, ( - SELECT count(*) - FROM (VALUES (0, 6), (1, 5), (2, 4), (3, 3)) t1(a, b) - WHERE a + b = c -) FROM (VALUES (6)) t2(c) --- !query schema -struct --- !query output -6 4 - - --- !query -SELECT *, (SELECT count(1) is null FROM t2 WHERE t1.c1 = t2.c1) FROM t1 --- !query schema -struct --- !query output -0 1 false -1 2 false - - --- !query -select (select f from (select false as f, max(c2) from t1 where t1.c1 = t1.c1)) from t2 --- !query schema -struct --- !query output -false -false - - --- !query -set spark.sql.optimizer.optimizeOneRowRelationSubquery.alwaysInline=false --- !query schema -struct --- !query output -spark.sql.optimizer.optimizeOneRowRelationSubquery.alwaysInline false - - --- !query -WITH T AS (SELECT 1 AS a) -SELECT (SELECT sum(1) FROM T WHERE a = col OR upper(col)= 'Y') -FROM (SELECT null as col) as foo --- !query schema -struct --- !query output -NULL - - --- !query -set spark.sql.optimizer.optimizeOneRowRelationSubquery.alwaysInline=true --- !query schema -struct --- !query output -spark.sql.optimizer.optimizeOneRowRelationSubquery.alwaysInline true - - --- !query -select * from ( - select t1.id c1, ( - select t2.id c from range (1, 2) t2 - where t1.id = t2.id ) c2 - from range (1, 3) t1 ) t -where t.c2 is not null --- !query schema -struct --- !query output -1 1 diff --git 
a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 069d697bd454b..0dc2cdd89f935 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -76,6 +76,7 @@ class ClickHouseTestSettings extends BackendTestSettings { false // nativeDoValidate failed due to spark conf cleanup case "GlutenBloomFilterAggregateQuerySuite" => !bloomFilterCases.contains(testName) + case "GlutenSortShuffleSuite" => false case _ => true } preCheck && super.shouldRun(suiteName, testName) @@ -323,9 +324,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-30793: truncate timestamps before the epoch to seconds and minutes") .excludeGlutenTest("unix_timestamp") .excludeGlutenTest("to_unix_timestamp") - .exclude("to_utc_timestamp with literal zone") .exclude("to_utc_timestamp with column zone") - .exclude("from_utc_timestamp with literal zone") .exclude("from_utc_timestamp with column zone") enableSuite[GlutenDeprecatedAPISuite] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOff].excludeGlutenTest( @@ -458,8 +457,10 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("string regex_replace / regex_extract") .exclude("string overlay function") .exclude("binary overlay function") - .exclude("string / binary substring function") .exclude("string parse_url function") + .exclude("string / binary length function") + .exclude("SPARK-36751: add octet length api for scala") + .exclude("SPARK-36751: add bit length api for scala") enableSuite[GlutenSubquerySuite] .exclude("SPARK-15370: COUNT bug in subquery in subquery in subquery") .exclude("SPARK-26893: Allow pushdown of partition pruning subquery filters to file source") @@ -490,7 +491,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-36920: Support day-time intervals by ABS") .exclude("SPARK-36921: Support YearMonthIntervalType by div") .exclude("SPARK-36921: Support DayTimeIntervalType by div") - enableSuite[GlutenBitwiseExpressionsSuite].exclude("BitGet") + enableSuite[GlutenBitwiseExpressionsSuite] enableSuite[GlutenCastSuite] .exclude("null cast") .exclude("cast string to date") @@ -757,34 +758,21 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK - 34814: LikeSimplification should handleNULL") enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix") enableSuite[GlutenStringExpressionsSuite] - .exclude("concat") .exclude("StringComparison") .exclude("Substring") .exclude("string substring_index function") .exclude("SPARK-40213: ascii for Latin-1 Supplement characters") .exclude("ascii for string") - .exclude("string for ascii") .exclude("base64/unbase64 for string") .exclude("encode/decode for string") .exclude("Levenshtein distance") - .exclude("soundex unit test") - .exclude("replace") .exclude("overlay for string") .exclude("overlay for byte array") .exclude("translate") - .exclude("FORMAT") - .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB") - .exclude("INSTR") .exclude("LOCATE") - .exclude("LPAD/RPAD") .exclude("REPEAT") - .exclude("length for string / binary") - .exclude("format_number / FormatNumber") - .exclude("ToNumber: positive tests") - .exclude("ToNumber: negative tests (the input string does not match the format 
string)") .exclude("ParseUrl") .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url") - .exclude("Sentences") enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite] enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite] enableSuite[GlutenDataSourceV2SQLSuiteV1Filter] diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala index 5067de74ea7e6..345971e9ffc0f 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala @@ -134,6 +134,7 @@ object VeloxSQLQueryTestSettings extends SQLQueryTestSettings { "subquery/negative-cases/invalid-correlation.sql", "subquery/negative-cases/subq-input-typecheck.sql", "subquery/scalar-subquery/scalar-subquery-predicate.sql", + "subquery/scalar-subquery/scalar-subquery-select.sql", "subquery/subquery-in-from.sql", "postgreSQL/aggregates_part1.sql", "postgreSQL/aggregates_part2.sql", @@ -241,9 +242,6 @@ object VeloxSQLQueryTestSettings extends SQLQueryTestSettings { "group-by.sql", "udf/udf-group-by.sql", // Overwrite some results of regr_intercept, regr_r2, corr. - "linear-regression.sql", - // Exception string doesn't match for - // SELECT (SELECT a FROM (SELECT 1 AS a UNION ALL SELECT 2 AS a) t) AS b - "subquery/scalar-subquery/scalar-subquery-select.sql" + "linear-regression.sql" ) } diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 98942462aec8b..e54aca34ec757 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -18,8 +18,9 @@ package org.apache.gluten.utils.velox import org.apache.gluten.utils.{BackendTestSettings, SQLQueryTestSettings} +import org.apache.spark.GlutenSortShuffleSuite import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.expressions.{GlutenArithmeticExpressionSuite, GlutenBitwiseExpressionsSuite, GlutenCastSuite, GlutenCollectionExpressionsSuite, GlutenComplexTypeSuite, GlutenConditionalExpressionSuite, GlutenDateExpressionsSuite, GlutenDecimalExpressionSuite, GlutenDecimalPrecisionSuite, GlutenHashExpressionsSuite, GlutenHigherOrderFunctionsSuite, GlutenIntervalExpressionsSuite, GlutenLiteralExpressionSuite, GlutenMathExpressionsSuite, GlutenMiscExpressionsSuite, GlutenNondeterministicSuite, GlutenNullExpressionsSuite, GlutenPredicateSuite, GlutenRandomSuite, GlutenRegexpExpressionsSuite, GlutenSortOrderExpressionsSuite, GlutenStringExpressionsSuite} +import org.apache.spark.sql.catalyst.expressions.{GlutenArithmeticExpressionSuite, GlutenBitwiseExpressionsSuite, GlutenCastSuite, GlutenCollectionExpressionsSuite, GlutenComplexTypeSuite, GlutenConditionalExpressionSuite, GlutenDateExpressionsSuite, GlutenDecimalExpressionSuite, GlutenDecimalPrecisionSuite, GlutenHashExpressionsSuite, GlutenHigherOrderFunctionsSuite, GlutenIntervalExpressionsSuite, GlutenLiteralExpressionSuite, GlutenMathExpressionsSuite, GlutenMiscExpressionsSuite, GlutenNondeterministicSuite, GlutenNullExpressionsSuite, GlutenPredicateSuite, GlutenRandomSuite, GlutenRegexpExpressionsSuite, GlutenSortOrderExpressionsSuite, 
GlutenStringExpressionsSuite, GlutenTryEvalSuite} import org.apache.spark.sql.connector._ import org.apache.spark.sql.errors.{GlutenQueryCompilationErrorsDSv2Suite, GlutenQueryCompilationErrorsSuite, GlutenQueryExecutionErrorsSuite, GlutenQueryParsingErrorsSuite} import org.apache.spark.sql.execution._ @@ -81,8 +82,6 @@ class VeloxTestSettings extends BackendTestSettings { // NEW SUITE: disable as it expects exception which doesn't happen when offloaded to gluten .exclude( "INCONSISTENT_BEHAVIOR_CROSS_VERSION: compatibility with Spark 2.4/3.2 in reading/writing dates") - // gluten throws different exception - .excludeByPrefix("SCALAR_SUBQUERY_TOO_MANY_ROWS:") // Doesn't support unhex with failOnError=true. .exclude("CONVERSION_INVALID_INPUT: to_binary conversion function hex") enableSuite[GlutenQueryParsingErrorsSuite] @@ -142,8 +141,10 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("random") .exclude("SPARK-9127 codegen with long seed") enableSuite[GlutenRegexpExpressionsSuite] + enableSuite[GlutenSortShuffleSuite] enableSuite[GlutenSortOrderExpressionsSuite] enableSuite[GlutenStringExpressionsSuite] + enableSuite[GlutenTryEvalSuite] enableSuite[VeloxAdaptiveQueryExecSuite] .includeAllGlutenTests() .includeByPrefix( @@ -187,6 +188,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("save csv with empty fields with user defined empty values") .exclude("save csv with quote") .exclude("SPARK-13543 Write the output as uncompressed via option()") + .exclude("DDL test with tab separated file") + .exclude("DDL test parsing decimal type") + .exclude("test with tab delimiter and double quote") // Arrow not support corrupt record .exclude("SPARK-27873: disabling enforceSchema should not fail columnNameOfCorruptRecord") enableSuite[GlutenCSVv2Suite] @@ -202,6 +206,11 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("save csv with empty fields with user defined empty values") .exclude("save csv with quote") .exclude("SPARK-13543 Write the output as uncompressed via option()") + .exclude("DDL test with tab separated file") + .exclude("DDL test parsing decimal type") + .exclude("test with tab delimiter and double quote") + // Arrow not support corrupt record + .exclude("SPARK-27873: disabling enforceSchema should not fail columnNameOfCorruptRecord") enableSuite[GlutenCSVLegacyTimeParserSuite] // file cars.csv include null string, Arrow not support to read .exclude("DDL test with schema") @@ -212,6 +221,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-13543 Write the output as uncompressed via option()") // Arrow not support corrupt record .exclude("SPARK-27873: disabling enforceSchema should not fail columnNameOfCorruptRecord") + .exclude("DDL test with tab separated file") + .exclude("DDL test parsing decimal type") + .exclude("test with tab delimiter and double quote") enableSuite[GlutenJsonV1Suite] // FIXME: Array direct selection fails .exclude("Complex field and type inferring") @@ -854,7 +866,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("Shouldn't change broadcast join buildSide if user clearly specified") .exclude("Shouldn't bias towards build right if user didn't specify") .exclude("SPARK-23192: broadcast hint should be retained after using the cached data") - .exclude("broadcast hint isn't propagated after a join") .exclude("broadcast join where streamed side's output partitioning is HashPartitioning") enableSuite[GlutenExistenceJoinSuite] @@ -1078,14 +1089,6 @@ class VeloxTestSettings extends 
BackendTestSettings { .exclude("to_timestamp") // Legacy mode is not supported, assuming this mode is not commonly used. .exclude("SPARK-30668: use legacy timestamp parser in to_timestamp") - // Replaced by another test. - .exclude("to_utc_timestamp with literal zone") - // Replaced by another test. - .exclude("to_utc_timestamp with column zone") - // Replaced by another test - .exclude("from_utc_timestamp with literal zone") - // Replaced by another test - .exclude("from_utc_timestamp with column zone") enableSuite[GlutenDeprecatedAPISuite] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOff] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOn] @@ -1116,8 +1119,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-31116: Select nested schema with case insensitive mode") // exclude as original metric not correct when task offloaded to velox .exclude("SPARK-37585: test input metrics for DSV2 with output limits") - // TODO(yuan): fix the input bytes on ORC code path - .exclude("SPARK-30362: test input metrics for DSV2") // DISABLED: GLUTEN-4893 Vanilla UT checks scan operator by exactly matching the class type .exclude("File source v2: support passing data filters to FileScan without partitionFilters") // DISABLED: GLUTEN-4893 Vanilla UT checks scan operator by exactly matching the class type @@ -1134,9 +1135,6 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenJoinSuite] // exclude as it check spark plan .exclude("SPARK-36794: Ignore duplicated key when building relation for semi/anti hash join") - // exclude as it check for SMJ node - .exclude( - "SPARK-43113: Full outer join with duplicate stream-side references in condition (SMJ)") enableSuite[GlutenMathFunctionsSuite] enableSuite[GlutenMetadataCacheSuite] .exclude("SPARK-16336,SPARK-27961 Suggest fixing FileNotFoundException") @@ -1220,9 +1218,6 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenGroupBasedMergeIntoTableSuite] enableSuite[GlutenFileSourceCustomMetadataStructSuite] enableSuite[GlutenParquetFileMetadataStructRowIndexSuite] - // Row index metadata column support in Velox isn't ready yet, refer velox-9147 - .exclude("reading _tmp_metadata_row_index - not present in a table") - .exclude("reading _tmp_metadata_row_index - present in a table") enableSuite[GlutenTableLocationSuite] enableSuite[GlutenRemoveRedundantWindowGroupLimitsSuite] diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/GlutenSortShuffleSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/GlutenSortShuffleSuite.scala new file mode 100644 index 0000000000000..70579c886248e --- /dev/null +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/GlutenSortShuffleSuite.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark + +import org.apache.spark.sql.GlutenTestsBaseTrait + +class GlutenSortShuffleSuite extends SortShuffleSuite with GlutenTestsBaseTrait { + override def beforeAll(): Unit = { + super.beforeAll() + conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager") + } +} diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala index edd2a5a9672df..8a28c4e98a26b 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala @@ -16,4 +16,40 @@ */ package org.apache.spark.sql -class GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTestsTrait {} +import org.apache.spark.sql.execution.ProjectExec +import org.apache.spark.sql.functions.{expr, input_file_name} +import org.apache.spark.sql.types._ + +class GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTestsTrait { + testGluten("input_file_name with scan is fallback") { + withTempPath { + dir => + val rawData = Seq( + Row(1, "Alice", Seq(Row(Seq(1, 2, 3)))), + Row(2, "Bob", Seq(Row(Seq(4, 5)))), + Row(3, "Charlie", Seq(Row(Seq(6, 7, 8, 9)))) + ) + val schema = StructType( + Array( + StructField("id", IntegerType, nullable = false), + StructField("name", StringType, nullable = false), + StructField( + "nested_column", + ArrayType( + StructType(Array( + StructField("array_in_struct", ArrayType(IntegerType), nullable = true) + ))), + nullable = true) + )) + val data: DataFrame = spark.createDataFrame(sparkContext.parallelize(rawData), schema) + data.write.parquet(dir.getCanonicalPath) + + val q = + spark.read.parquet(dir.getCanonicalPath).select(input_file_name(), expr("nested_column")) + val firstRow = q.head() + assert(firstRow.getString(0).contains(dir.toURI.getPath)) + val project = q.queryExecution.executedPlan.collect { case p: ProjectExec => p } + assert(project.size == 1) + } + } +} diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala index fbbfdf45daa86..a946e6de43452 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala @@ -246,80 +246,4 @@ class GlutenDateFunctionsSuite extends DateFunctionsSuite with GlutenSQLTestsTra } } } - - testGluten("to_utc_timestamp with literal zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") - ).toDF("a", "b") - checkAnswer( - df.select(to_utc_timestamp(col("a"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-25 07:00:00"))) - ) - checkAnswer( - df.select(to_utc_timestamp(col("b"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-25 07:00:00"))) - ) - } - - testGluten("to_utc_timestamp with column zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "America/Los_Angeles"), - (Timestamp.valueOf("2015-07-25 
00:00:00"), "2015-07-25 00:00:00", "Europe/Paris") - ).toDF("a", "b", "c") - checkAnswer( - df.select(to_utc_timestamp(col("a"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-24 22:00:00"))) - ) - checkAnswer( - df.select(to_utc_timestamp(col("b"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-24 22:00:00"))) - ) - } - - testGluten("from_utc_timestamp with literal zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") - ).toDF("a", "b") - checkAnswer( - df.select(from_utc_timestamp(col("a"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-23 17:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - checkAnswer( - df.select(from_utc_timestamp(col("b"), "America/Los_Angeles")), - Seq( - Row(Timestamp.valueOf("2015-07-23 17:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - } - - testGluten("from_utc_timestamp with column zone") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "Europe/Paris"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "America/Los_Angeles") - ).toDF("a", "b", "c") - checkAnswer( - df.select(from_utc_timestamp(col("a"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 02:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - checkAnswer( - df.select(from_utc_timestamp(col("b"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 02:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00"))) - ) - } } diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenJoinSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenJoinSuite.scala index 09718fb1a439b..5ef4056201ed2 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenJoinSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenJoinSuite.scala @@ -16,9 +16,6 @@ */ package org.apache.spark.sql -import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec - class GlutenJoinSuite extends JoinSuite with GlutenSQLTestsTrait { override def testNameBlackList: Seq[String] = Seq( @@ -57,14 +54,4 @@ class GlutenJoinSuite extends JoinSuite with GlutenSQLTestsTrait { |""".stripMargin checkAnswer(spark.sql(sql), Seq(Row(0, 1), Row(1, 2), Row(2, 3))) } - - testGluten( - "SPARK-43113: Full outer join with duplicate stream-side" + - " references in condition (SHJ)") { - def check(plan: SparkPlan): Unit = { - assert(collect(plan) { case _: ShuffledHashJoinExec => true }.size === 1) - } - - dupStreamSideColTest("MERGE", check) - } } diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala index 24963f89d3fbe..5a28031b6c7a4 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala @@ -89,13 +89,13 @@ class GlutenJsonFunctionsSuite extends JsonFunctionsSuite with GlutenSQLTestsTra runTest("[\"a\",\"b\"]", "$[1]", "b") runTest("[[\"a\",\"b\"]]", "$[0][1]", "b") - runTest("[1,2,3]", "[0]", "1") - // runTest("[1,2,3]", "$0", null) crashes in velox + runTest("[1,2,3]", "[0]", null) + runTest("[1,2,3]", "$0", null) runTest("[1,2,3]", "0", 
null) runTest("[1,2,3]", "$.", null) - // runTest("[1,2,3]", "$", "[1,2,3]") crashes in velox - // runTest("{\"a\":4}", "$", "{\"a\":4}") crashes in velox + runTest("[1,2,3]", "$", "[1,2,3]") + runTest("{\"a\":4}", "$", "{\"a\":4}") def runTest(json: String, path: String, exp: String): Unit = { checkAnswer(Seq(json).toDF().selectExpr(s"get_json_object(value, '$path')"), Row(exp)) diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala index c58284e4403bb..3d82e214f0311 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenStringFunctionsSuite.scala @@ -30,10 +30,6 @@ class GlutenStringFunctionsSuite import testImplicits._ - override def testNameBlackList: Seq[String] = super.testNameBlackList ++ Seq( - "string / binary length function" - ) - testGluten("string split function with no limit and regex pattern") { val df1 = Seq(("aaAbbAcc4")).toDF("a").select(split($"a", "A")) checkAnswer(df1, Row(Seq("aa", "bb", "cc4"))) diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala index e726dcea18c7f..f4a9dfd56a927 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenDateExpressionsSuite.scala @@ -380,6 +380,9 @@ class GlutenDateExpressionsSuite extends DateExpressionsSuite with GlutenTestsTr checkEvaluation( FromUnixTime(Literal(-1000L), Literal(fmt2), timeZoneId), sdf2.format(new Timestamp(-1000000))) + checkEvaluation( + FromUnixTime(Literal(Long.MaxValue), Literal(fmt2), timeZoneId), + sdf2.format(new Timestamp(-1000))) checkEvaluation( FromUnixTime( Literal.create(null, LongType), diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenTryEvalSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenTryEvalSuite.scala new file mode 100644 index 0000000000000..6af97677e5d8e --- /dev/null +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenTryEvalSuite.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.sql.GlutenTestsTrait + +class GlutenTryEvalSuite extends TryEvalSuite with GlutenTestsTrait {} diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/errors/GlutenQueryExecutionErrorsSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/errors/GlutenQueryExecutionErrorsSuite.scala index d9b661c8a4a79..c25e65cf0b686 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/errors/GlutenQueryExecutionErrorsSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/errors/GlutenQueryExecutionErrorsSuite.scala @@ -16,23 +16,21 @@ */ package org.apache.spark.sql.errors +import org.apache.spark.SparkConf import org.apache.spark.sql.GlutenSQLTestsBaseTrait class GlutenQueryExecutionErrorsSuite extends QueryExecutionErrorsSuite with GlutenSQLTestsBaseTrait { - override protected def getResourceParquetFilePath(name: String): String = { - getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString + "/" + name + + override def sparkConf: SparkConf = { + // Disables VeloxAppendBatches in which GeneralOutIterator wraps vanilla Spark's exceptions + // with GlutenException. + super.sparkConf + .set("spark.gluten.sql.columnar.backend.velox.coalesceBatchesBeforeShuffle", "false") } - testGluten( - "SCALAR_SUBQUERY_TOO_MANY_ROWS: " + - "More than one row returned by a subquery used as an expression") { - val exception = intercept[IllegalStateException] { - sql("select (select a from (select 1 as a union all select 2 as a) t) as b").collect() - } - assert( - exception.getMessage.contains("more than one row returned by a subquery" + - " used as an expression")) + override protected def getResourceParquetFilePath(name: String): String = { + getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString + "/" + name } } diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala index 7976288dd4efc..8ce0af8df051e 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.execution.BasicScanExecTransformer import org.apache.gluten.extension.GlutenPlan -import org.apache.gluten.extension.columnar.{FallbackEmptySchemaRelation, TRANSFORM_UNSUPPORTED, TransformHints} +import org.apache.gluten.extension.columnar.{FallbackEmptySchemaRelation, FallbackTags, TRANSFORM_UNSUPPORTED} import org.apache.gluten.extension.columnar.heuristic.HeuristicApplier import org.apache.gluten.extension.columnar.transition.InsertTransitions import org.apache.gluten.utils.QueryPlanSelector @@ -125,10 +125,10 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { testGluten("Tag not transformable more than once") { val originalPlan = UnaryOp1(LeafOp(supportsColumnar = true)) - TransformHints.tag(originalPlan, TRANSFORM_UNSUPPORTED(Some("fake reason"))) + FallbackTags.tag(originalPlan, TRANSFORM_UNSUPPORTED(Some("fake reason"))) val rule = FallbackEmptySchemaRelation() val newPlan = rule.apply(originalPlan) - val reason = TransformHints.getHint(newPlan).asInstanceOf[TRANSFORM_UNSUPPORTED].reason + val reason = 
FallbackTags.getTag(newPlan).asInstanceOf[TRANSFORM_UNSUPPORTED].reason assert(reason.isDefined) if (BackendsApiManager.getSettings.fallbackOnEmptySchema(newPlan)) { assert( diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/GlutenReplaceHashWithSortAggSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/GlutenReplaceHashWithSortAggSuite.scala index 8d795bbffea0c..332c21418a9b9 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/GlutenReplaceHashWithSortAggSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/GlutenReplaceHashWithSortAggSuite.scala @@ -84,11 +84,11 @@ class GlutenReplaceHashWithSortAggSuite withTempView("t1", "t2") { spark.range(100).selectExpr("id as key").createOrReplaceTempView("t1") spark.range(50).selectExpr("id as key").createOrReplaceTempView("t2") - Seq("COUNT", "COLLECT_LIST").foreach { - aggExpr => + Seq(("COUNT", 0, 1, 2, 0), ("COLLECT_LIST", 2, 0, 2, 0)).foreach { + aggExprInfo => val query = s""" - |SELECT key, $aggExpr(key) + |SELECT key, ${aggExprInfo._1}(key) |FROM |( | SELECT /*+ SHUFFLE_MERGE(t1) */ t1.key AS key @@ -98,7 +98,7 @@ class GlutenReplaceHashWithSortAggSuite |) |GROUP BY key """.stripMargin - checkAggs(query, 2, 0, 2, 0) + checkAggs(query, aggExprInfo._2, aggExprInfo._3, aggExprInfo._4, aggExprInfo._5) } } } diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala index 5e4d1ed8f81f4..ed347d024c1c2 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala @@ -16,12 +16,11 @@ */ package org.apache.spark.sql.execution.datasources +import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.execution.{FileSourceScanExecTransformer, FilterExecTransformer} -import org.apache.gluten.utils.BackendTestUtils import org.apache.spark.sql.{Column, DataFrame, Row} import org.apache.spark.sql.GlutenSQLTestsBaseTrait -import org.apache.spark.sql.GlutenTestConstants.GLUTEN_TEST import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType} @@ -60,7 +59,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS f: (DataFrame, Map[String, Any], Map[String, Any]) => Unit): Unit = { Seq("parquet").foreach { testFileFormat => - test(s"$GLUTEN_TEST metadata struct ($testFileFormat): " + testName) { + testGluten(s"metadata struct ($testFileFormat): " + testName) { withTempDir { dir => import scala.collection.JavaConverters._ @@ -109,7 +108,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS METADATA_FILE_MODIFICATION_TIME, "age") dfWithMetadata.collect - if (BackendTestUtils.isVeloxBackendLoaded()) { + if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) { checkOperatorMatch[FileSourceScanExecTransformer](dfWithMetadata) } else { checkOperatorMatch[FileSourceScanExec](dfWithMetadata) @@ -134,7 +133,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS .where(Column(METADATA_FILE_NAME) === f0((METADATA_FILE_NAME))) val ret = filterDF.collect assert(ret.size == 1) - if 
(BackendTestUtils.isVeloxBackendLoaded()) { + if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) { checkOperatorMatch[FileSourceScanExecTransformer](filterDF) } else { checkOperatorMatch[FileSourceScanExec](filterDF) @@ -150,7 +149,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS Row(f1(METADATA_FILE_PATH)) ) ) - if (BackendTestUtils.isVeloxBackendLoaded()) { + if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) { checkOperatorMatch[FileSourceScanExecTransformer](filterDF) } else { checkOperatorMatch[FileSourceScanExec](filterDF) diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenV1WriteCommandSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenV1WriteCommandSuite.scala index 3d277b94cc3e7..fcaf75a4d5c18 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenV1WriteCommandSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenV1WriteCommandSuite.scala @@ -16,12 +16,13 @@ */ package org.apache.spark.sql.execution.datasources +import org.apache.gluten.GlutenColumnarWriteTestSupport import org.apache.gluten.execution.SortExecTransformer import org.apache.spark.sql.GlutenSQLTestsBaseTrait import org.apache.spark.sql.catalyst.expressions.{Ascending, AttributeReference, NullsFirst, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Sort} -import org.apache.spark.sql.execution.{QueryExecution, SortExec, VeloxColumnarWriteFilesExec} +import org.apache.spark.sql.execution.{QueryExecution, SortExec} import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{IntegerType, StringType} @@ -96,7 +97,8 @@ trait GlutenV1WriteCommandSuiteBase extends V1WriteCommandSuiteBase { class GlutenV1WriteCommandSuite extends V1WriteCommandSuite with GlutenV1WriteCommandSuiteBase - with GlutenSQLTestsBaseTrait { + with GlutenSQLTestsBaseTrait + with GlutenColumnarWriteTestSupport { testGluten( "SPARK-41914: v1 write with AQE and in-partition sorted - non-string partition column") { @@ -122,8 +124,7 @@ class GlutenV1WriteCommandSuite val executedPlan = FileFormatWriter.executedPlan.get val plan = if (enabled) { - assert(executedPlan.isInstanceOf[VeloxColumnarWriteFilesExec]) - executedPlan.asInstanceOf[VeloxColumnarWriteFilesExec].child + checkWriteFilesAndGetChild(executedPlan) } else { executedPlan.transformDown { case a: AdaptiveSparkPlanExec => a.executedPlan } } @@ -204,8 +205,7 @@ class GlutenV1WriteCommandSuite val executedPlan = FileFormatWriter.executedPlan.get val plan = if (enabled) { - assert(executedPlan.isInstanceOf[VeloxColumnarWriteFilesExec]) - executedPlan.asInstanceOf[VeloxColumnarWriteFilesExec].child + checkWriteFilesAndGetChild(executedPlan) } else { executedPlan.transformDown { case a: AdaptiveSparkPlanExec => a.executedPlan } } diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/csv/GlutenCSVSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/csv/GlutenCSVSuite.scala index cb7ce87f97dad..8b75dad33c385 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/csv/GlutenCSVSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/csv/GlutenCSVSuite.scala @@ -20,16 +20,14 @@ import 
org.apache.gluten.GlutenConfig import org.apache.gluten.exception.GlutenException import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.sql.{AnalysisException, GlutenSQLTestsBaseTrait, Row} +import org.apache.spark.sql.{GlutenSQLTestsBaseTrait, Row} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{DateType, IntegerType, StringType, StructType, TimestampType} +import org.apache.spark.sql.types.{DateType, IntegerType, StructType, TimestampType} import org.scalatest.exceptions.TestFailedException import java.sql.{Date, Timestamp} -import scala.collection.JavaConverters.seqAsJavaListConverter - class GlutenCSVSuite extends CSVSuite with GlutenSQLTestsBaseTrait { override def sparkConf: SparkConf = @@ -43,68 +41,9 @@ class GlutenCSVSuite extends CSVSuite with GlutenSQLTestsBaseTrait { } class GlutenCSVv1Suite extends GlutenCSVSuite { - import testImplicits._ override def sparkConf: SparkConf = super.sparkConf .set(SQLConf.USE_V1_SOURCE_LIST, "csv") - - testGluten("SPARK-23786: Ignore column name case if spark.sql.caseSensitive is false") { - withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { - withTempPath { - path => - val oschema = new StructType().add("A", StringType) - // change the row content 0 to string bbb in Gluten for test - val odf = spark.createDataFrame(List(Row("bbb")).asJava, oschema) - odf.write.option("header", true).csv(path.getCanonicalPath) - val ischema = new StructType().add("a", StringType) - val idf = spark.read - .schema(ischema) - .option("header", true) - .option("enforceSchema", false) - .csv(path.getCanonicalPath) - checkAnswer(idf, odf) - } - } - } - - testGluten("case sensitivity of filters references") { - Seq(true, false).foreach { - filterPushdown => - withSQLConf(SQLConf.CSV_FILTER_PUSHDOWN_ENABLED.key -> filterPushdown.toString) { - withTempPath { - path => - Seq("""aaa,BBB""", """0,1""", """2,3""") - .toDF() - .repartition(1) - .write - .text(path.getCanonicalPath) - withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { - // change the schema to Arrow schema to support read in Gluten - val readback = spark.read - .schema("aaa long, BBB long") - .option("header", true) - .csv(path.getCanonicalPath) - checkAnswer(readback, Seq(Row(2, 3), Row(0, 1))) - checkAnswer(readback.filter($"AAA" === 2 && $"bbb" === 3), Seq(Row(2, 3))) - } - withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { - val readback = spark.read - .schema("aaa long, BBB long") - .option("header", true) - .csv(path.getCanonicalPath) - checkAnswer(readback, Seq(Row(2, 3), Row(0, 1))) - checkError( - exception = intercept[AnalysisException] { - readback.filter($"AAA" === 2 && $"bbb" === 3).collect() - }, - errorClass = "UNRESOLVED_COLUMN.WITH_SUGGESTION", - parameters = Map("objectName" -> "`AAA`", "proposal" -> "`BBB`, `aaa`") - ) - } - } - } - } - } } class GlutenCSVv2Suite extends GlutenCSVSuite { diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetFilterSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetFilterSuite.scala index bb4a78a827956..4141acee31708 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetFilterSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetFilterSuite.scala @@ -37,7 +37,6 @@ import org.apache.spark.util.Utils import org.apache.hadoop.fs.Path import 
org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate, Operators} import org.apache.parquet.filter2.predicate.FilterApi._ -import org.apache.parquet.filter2.predicate.Operators import org.apache.parquet.filter2.predicate.Operators.{Column => _, Eq, Gt, GtEq, Lt, LtEq, NotEq} import org.apache.parquet.hadoop.{ParquetFileReader, ParquetInputFormat, ParquetOutputFormat} import org.apache.parquet.hadoop.util.HadoopInputFile diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetRowIndexSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetRowIndexSuite.scala index abf21651f8272..4c53396792c2d 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetRowIndexSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetRowIndexSuite.scala @@ -49,6 +49,7 @@ class GlutenParquetRowIndexSuite extends ParquetRowIndexSuite with GlutenSQLTest .getBlocks .asScala .map(_.getRowCount) + .toSeq } private def readRowGroupRowCounts(dir: File): Seq[Seq[Long]] = { @@ -221,8 +222,6 @@ class GlutenParquetRowIndexSuite extends ParquetRowIndexSuite with GlutenSQLTest df.schema.add(rowIndexColName, LongType, nullable = true) } - logInfo(s"gyytest schemaWithRowIndex $schemaWithRowIdx") - df.write .format(conf.writeFormat) .option(ParquetOutputFormat.BLOCK_SIZE, conf.rowGroupSize) diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala index 00b4bf5821f1e..4df9c63b3ef6d 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.sql.extension -import org.apache.gluten.extension.{ColumnarOverrideRules, JoinSelectionOverrides} +import org.apache.gluten.extension.ColumnarOverrideRules import org.apache.spark.SparkConf import org.apache.spark.sql._ @@ -31,7 +31,6 @@ class GlutenSessionExtensionSuite extends GlutenSQLTestsTrait { testGluten("test gluten extensions") { assert(spark.sessionState.columnarRules.contains(ColumnarOverrideRules(spark))) - assert(spark.sessionState.planner.strategies.contains(JoinSelectionOverrides(spark))) assert(spark.sessionState.planner.strategies.contains(MySparkStrategy(spark))) assert(spark.sessionState.analyzer.extendedResolutionRules.contains(MyRule(spark))) diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala index 592019f34ccf0..85af9a6238531 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/extension/TestFileSourceScanExecTransformer.scala @@ -50,6 +50,7 @@ case class TestFileSourceScanExecTransformer( override def getPartitions: Seq[InputPartition] = BackendsApiManager.getTransformerApiInstance.genInputPartitionSeq( relation, + requiredSchema, selectedPartitions, output, bucketedScan, diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/gluten/GlutenFallbackSuite.scala 
b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/gluten/GlutenFallbackSuite.scala index 9e8c7e54291a6..fd6aa047558f1 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/gluten/GlutenFallbackSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/gluten/GlutenFallbackSuite.scala @@ -25,6 +25,7 @@ import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent} import org.apache.spark.sql.{GlutenSQLTestsTrait, Row} import org.apache.spark.sql.execution.ProjectExec import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.ui.{GlutenSQLAppStatusStore, SparkListenerSQLExecutionStart} import org.apache.spark.status.ElementTrackingStore @@ -168,18 +169,52 @@ class GlutenFallbackSuite extends GlutenSQLTestsTrait with AdaptiveSparkPlanHelp withSQLConf(GlutenConfig.EXPRESSION_BLACK_LIST.key -> "add") { try { val df = spark.sql("select sum(id + 1) from range(10)") - spark.sparkContext.listenerBus.waitUntilEmpty() df.collect() + spark.sparkContext.listenerBus.waitUntilEmpty() val project = find(df.queryExecution.executedPlan) { _.isInstanceOf[ProjectExec] } assert(project.isDefined) - events.exists( - _.fallbackNodeToReason.values.toSet - .contains("Project: Not supported to map spark function name")) + assert( + events.exists(_.fallbackNodeToReason.values.toSet + .exists(_.contains("Not supported to map spark function name")))) } finally { spark.sparkContext.removeSparkListener(listener) } } } + + test("ExpandFallbackPolicy should propagate fallback reason to vanilla SparkPlan") { + val events = new ArrayBuffer[GlutenPlanFallbackEvent] + val listener = new SparkListener { + override def onOtherEvent(event: SparkListenerEvent): Unit = { + event match { + case e: GlutenPlanFallbackEvent => events.append(e) + case _ => + } + } + } + spark.sparkContext.addSparkListener(listener) + spark.range(10).selectExpr("id as c1", "id as c2").write.format("parquet").saveAsTable("t") + withTable("t") { + withSQLConf( + GlutenConfig.EXPRESSION_BLACK_LIST.key -> "max", + GlutenConfig.COLUMNAR_WHOLESTAGE_FALLBACK_THRESHOLD.key -> "1") { + try { + val df = spark.sql("select c2, max(c1) as id from t group by c2") + df.collect() + spark.sparkContext.listenerBus.waitUntilEmpty() + val agg = collect(df.queryExecution.executedPlan) { case a: HashAggregateExec => a } + assert(agg.size == 2) + assert( + events.count( + _.fallbackNodeToReason.values.toSet.exists(_.contains( + "Could not find a valid substrait mapping name for max" + ))) == 2) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + } + } + } } diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/sources/GlutenInsertSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/sources/GlutenInsertSuite.scala index 2814c2e8cba6d..084c2faa8c5c2 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/sources/GlutenInsertSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/sources/GlutenInsertSuite.scala @@ -16,6 +16,7 @@ */ package org.apache.spark.sql.sources +import org.apache.gluten.GlutenColumnarWriteTestSupport import org.apache.gluten.execution.SortExecTransformer import org.apache.gluten.extension.GlutenPlan @@ -24,7 +25,7 @@ import org.apache.spark.executor.OutputMetrics import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier -import 
org.apache.spark.sql.execution.{CommandResultExec, QueryExecution, VeloxColumnarWriteFilesExec} +import org.apache.spark.sql.execution.{CommandResultExec, QueryExecution, SparkPlan} import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.command.DataWritingCommandExec import org.apache.spark.sql.execution.metric.SQLMetric @@ -38,7 +39,8 @@ import java.io.{File, IOException} class GlutenInsertSuite extends InsertSuite with GlutenSQLTestsBaseTrait - with AdaptiveSparkPlanHelper { + with AdaptiveSparkPlanHelper + with GlutenColumnarWriteTestSupport { override def sparkConf: SparkConf = { super.sparkConf.set("spark.sql.leafNodeDefaultParallelism", "1") @@ -60,13 +62,13 @@ class GlutenInsertSuite super.afterAll() } - private def checkAndGetWriteFiles(df: DataFrame): VeloxColumnarWriteFilesExec = { + private def checkWriteFilesAndGetChild(df: DataFrame): (SparkPlan, SparkPlan) = { val writeFiles = stripAQEPlan( df.queryExecution.executedPlan .asInstanceOf[CommandResultExec] .commandPhysicalPlan).children.head - assert(writeFiles.isInstanceOf[VeloxColumnarWriteFilesExec]) - writeFiles.asInstanceOf[VeloxColumnarWriteFilesExec] + val child = checkWriteFilesAndGetChild(writeFiles) + (writeFiles, child) } testGluten("insert partition table") { @@ -97,7 +99,7 @@ class GlutenInsertSuite val df = spark.sql("INSERT INTO TABLE pt partition(pt='a') SELECT * FROM VALUES(1, 'a'),(2, 'b')") spark.sparkContext.listenerBus.waitUntilEmpty() - checkAndGetWriteFiles(df) + checkWriteFilesAndGetChild(df) assert(taskMetrics.bytesWritten > 0) assert(taskMetrics.recordsWritten == 2) @@ -135,13 +137,13 @@ class GlutenInsertSuite private def validateDynamicPartitionWrite( df: DataFrame, expectedPartitionNames: Set[String]): Unit = { - val writeFiles = checkAndGetWriteFiles(df) + val (writeFiles, writeChild) = checkWriteFilesAndGetChild(df) assert( writeFiles .find(_.isInstanceOf[SortExecTransformer]) .isEmpty) // all operators should be transformed - assert(writeFiles.child.find(!_.isInstanceOf[GlutenPlan]).isEmpty) + assert(writeChild.find(!_.isInstanceOf[GlutenPlan]).isEmpty) val parts = spark.sessionState.catalog.listPartitionNames(TableIdentifier("pt")).toSet assert(parts == expectedPartitionNames) @@ -209,7 +211,7 @@ class GlutenInsertSuite spark.sql("CREATE TABLE t (c1 int, c2 string) USING PARQUET") val df = spark.sql("INSERT OVERWRITE TABLE t SELECT c1, c2 FROM source SORT BY c1") - val writeFiles = checkAndGetWriteFiles(df) + val (writeFiles, _) = checkWriteFilesAndGetChild(df) assert(writeFiles.find(x => x.isInstanceOf[SortExecTransformer]).isDefined) checkAnswer(spark.sql("SELECT * FROM t"), spark.sql("SELECT * FROM source SORT BY c1")) } @@ -244,7 +246,7 @@ class GlutenInsertSuite spark.sql("CREATE TABLE t1 USING PARQUET AS SELECT id as c1, id % 3 as c2 FROM range(10)") spark.sql("CREATE TABLE t2 (c1 long, c2 long) USING PARQUET") val df = spark.sql("INSERT INTO TABLE t2 SELECT c2, count(*) FROM t1 GROUP BY c2") - checkAndGetWriteFiles(df) + checkWriteFilesAndGetChild(df) } } @@ -257,7 +259,7 @@ class GlutenInsertSuite spark.sql("INSERT INTO TABLE t1 VALUES(1, 1),(2, 2)") spark.sql("CREATE TABLE t2 (c1 long, c2 long) USING PARQUET") val df = spark.sql("INSERT INTO TABLE t2 SELECT * FROM t1") - checkAndGetWriteFiles(df) + checkWriteFilesAndGetChild(df) } } @@ -405,7 +407,7 @@ class GlutenInsertSuite withTable("t") { sql(s"create table t(i boolean) using ${config.dataSource}") if (config.useDataFrames) { - Seq((false)).toDF.write.insertInto("t") + 
Seq(false).toDF.write.insertInto("t") } else { sql("insert into t select false") } @@ -420,12 +422,12 @@ class GlutenInsertSuite val incompatibleDefault = "Failed to execute ALTER TABLE ADD COLUMNS command because the destination " + "table column `s` has a DEFAULT value" - Seq(Config("parquet"), Config("parquet", true)).foreach { + Seq(Config("parquet"), Config("parquet", useDataFrames = true)).foreach { config => withTable("t") { sql(s"create table t(i boolean) using ${config.dataSource}") if (config.useDataFrames) { - Seq((false)).toDF.write.insertInto("t") + Seq(false).toDF.write.insertInto("t") } else { sql("insert into t select false") } @@ -452,7 +454,7 @@ class GlutenInsertSuite withTable("t") { sql(s"create table t(i boolean) using ${config.dataSource}") if (config.useDataFrames) { - Seq((false)).toDF.write.insertInto("t") + Seq(false).toDF.write.insertInto("t") } else { sql("insert into t select false") } @@ -469,12 +471,12 @@ class GlutenInsertSuite val incompatibleDefault = "Failed to execute ALTER TABLE ADD COLUMNS command because the destination " + "table column `s` has a DEFAULT value" - Seq(Config("parquet"), Config("parquet", true)).foreach { + Seq(Config("parquet"), Config("parquet", useDataFrames = true)).foreach { config => withTable("t") { sql(s"create table t(i boolean) using ${config.dataSource}") if (config.useDataFrames) { - Seq((false)).toDF.write.insertInto("t") + Seq(false).toDF.write.insertInto("t") } else { sql("insert into t select false") } @@ -501,7 +503,7 @@ class GlutenInsertSuite withTable("t") { sql(s"create table t(i boolean) using ${config.dataSource}") if (config.useDataFrames) { - Seq((false)).toDF.write.insertInto("t") + Seq(false).toDF.write.insertInto("t") } else { sql("insert into t select false") } @@ -566,12 +568,12 @@ class GlutenInsertSuite val incompatibleDefault = "Failed to execute ALTER TABLE ADD COLUMNS command because the destination " + "table column `s` has a DEFAULT value" - Seq(Config("parquet"), Config("parquet", true)).foreach { + Seq(Config("parquet"), Config("parquet", useDataFrames = true)).foreach { config => withTable("t") { sql(s"create table t(i boolean) using ${config.dataSource}") if (config.useDataFrames) { - Seq((false)).toDF.write.insertInto("t") + Seq(false).toDF.write.insertInto("t") } else { sql("insert into t select false") } diff --git a/gluten-ut/test/pom.xml b/gluten-ut/test/pom.xml index d55e6ca917e7b..25ec542deab2b 100644 --- a/gluten-ut/test/pom.xml +++ b/gluten-ut/test/pom.xml @@ -90,7 +90,7 @@ org.apache.arrow arrow-c-data - ${arrow.version} + ${arrow-gluten.version} test diff --git a/package/pom.xml b/package/pom.xml index f405124a2a779..ab87e14805ff5 100644 --- a/package/pom.xml +++ b/package/pom.xml @@ -83,6 +83,16 @@ + + iceberg + + + org.apache.gluten + gluten-iceberg + ${project.version} + + + @@ -205,8 +215,10 @@ target - *3.2* - *3.3* + *spark3.2* + *spark3.3* + *spark3.4* + *spark3.5* false diff --git a/pom.xml b/pom.xml index 0f37bcbf18512..f12469317747e 100644 --- a/pom.xml +++ b/pom.xml @@ -41,9 +41,7 @@ 1.8 ${java.version} ${java.version} - 2.9.3 - 2.0.1 - 20 + 2.9.3 2.12 2.12.15 3 @@ -55,11 +53,14 @@ delta-core 2.4.0 24 - 0.3.2-incubating + 0.4.1 0.8.0 15.0.0 + 15.0.0-gluten arrow-memory-unsafe 2.7.4 + 2.0.7 + 2.20.0 UTF-8 UTF-8 spark-sql-columnar @@ -114,6 +115,100 @@ + + scala-2.12 + + true + + + + 2.12.15 + 2.12 + + + + scala-2.13 + + 2.13.8 + 2.13 + + + + + + net.alchim31.maven + scala-maven-plugin + + + + -unchecked + -deprecation + -feature + -explaintypes + -target:jvm-1.8 + 
-Wconf:cat=deprecation:wv,any:e + -Wunused:imports + + -Wconf:cat=scaladoc:wv + -Wconf:cat=lint-multiarg-infix:wv + -Wconf:cat=other-nullary-override:wv + + -Wconf:msg=^(?=.*?method|value|type|object|trait|inheritance)(?=.*?deprecated)(?=.*?since 2.13).+$:s + -Wconf:msg=^(?=.*?Widening conversion from)(?=.*?is deprecated because it loses precision).+$:s + -Wconf:msg=Auto-application to \`\(\)\` is deprecated:s + -Wconf:msg=method with a single empty parameter list overrides method without any parameter list:s + -Wconf:msg=method without a parameter list overrides a method with a single empty one:s + + -Wconf:cat=deprecation&msg=procedure syntax is deprecated:e + + -Wconf:cat=unchecked&msg=outer reference:s + -Wconf:cat=unchecked&msg=eliminated by erasure:s + -Wconf:msg=^(?=.*?a value of type)(?=.*?cannot also be).+$:s + + + + + + + + + + java-8 @@ -123,6 +218,16 @@ 1.8 + + java-11 + + 11 + + + 11 + 3.1.8 + + java-17 @@ -130,6 +235,7 @@ 17 + 3.1.8 @@ -155,8 +261,8 @@ 1.3.1 delta-core - 2.2.0 - 22 + 2.3.0 + 23 @@ -181,11 +287,25 @@ 3.5.1 1.5.0 delta-spark - 3.1.0 - 31 + 3.2.0 + 32 2.15.1 3.3.4 + + + org.slf4j + slf4j-api + ${slf4j.version} + provided + + + org.apache.logging.log4j + log4j-slf4j2-impl + ${log4j.version} + provided + + hadoop-2.7.4 @@ -302,7 +422,7 @@ com.github.ben-manes.caffeine caffeine - ${caffeine.version.java8} + ${caffeine.version} org.apache.spark @@ -511,7 +631,7 @@ org.scalatest scalatest_${scala.binary.version} - 3.2.3 + 3.2.16 test @@ -625,8 +745,8 @@ io.github.zhztheplayer.scalawarts - scalawarts - 0.1.1 + scalawarts_${scala.binary.version} + 0.1.2 ${scala.recompile.mode} diff --git a/shims/common/pom.xml b/shims/common/pom.xml index adb4112ff2c4c..959a95fc074fe 100644 --- a/shims/common/pom.xml +++ b/shims/common/pom.xml @@ -61,6 +61,9 @@ -Wconf:cat=deprecation:silent + diff --git a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala index 9db063a6623fe..4ef96bec27ebd 100644 --- a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala +++ b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala @@ -28,7 +28,6 @@ import java.util.Locale import java.util.concurrent.TimeUnit import scala.collection.JavaConverters._ -import scala.collection.JavaConverters.collectionAsScalaIterableConverter case class GlutenNumaBindingInfo( enableNumaBinding: Boolean, @@ -85,6 +84,8 @@ class GlutenConfig(conf: SQLConf) extends Logging { def enableColumnarBroadcastJoin: Boolean = conf.getConf(COLUMNAR_BROADCAST_JOIN_ENABLED) + def enableColumnarSample: Boolean = conf.getConf(COLUMNAR_SAMPLE_ENABLED) + def enableColumnarArrowUDF: Boolean = conf.getConf(COLUMNAR_ARROW_UDF_ENABLED) def enableColumnarCoalesce: Boolean = conf.getConf(COLUMNAR_COALESCE_ENABLED) @@ -152,9 +153,6 @@ class GlutenConfig(conf: SQLConf) extends Logging { def logicalJoinOptimizationThrottle: Integer = conf.getConf(COLUMNAR_LOGICAL_JOIN_OPTIMIZATION_THROTTLE) - def enableLogicalJoinOptimize: Boolean = - conf.getConf(COLUMNAR_LOGICAL_JOIN_OPTIMIZATION_ENABLED) - def enableScanOnly: Boolean = conf.getConf(COLUMNAR_SCAN_ONLY_ENABLED) def tmpFile: Option[String] = conf.getConf(COLUMNAR_TEMP_DIR) @@ -185,6 +183,7 @@ class GlutenConfig(conf: SQLConf) extends Logging { def columnarShuffleCompressionThreshold: Int = conf.getConf(COLUMNAR_SHUFFLE_COMPRESSION_THRESHOLD) + // FIXME: Not clear: MIN or MAX ? 
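// Illustrative sketch (editorial, not part of this patch): how the batch-size knobs added in
// this file relate to each other. Only accessors defined in GlutenConfig are used; the call
// site itself is an assumption.
// {{{
//   val gluten = GlutenConfig.getConf
//   val batchRows = gluten.maxBatchSize            // the configured columnar batch size (rows)
//   if (gluten.veloxCoalesceBatchesBeforeShuffle) {
//     // batches smaller than this threshold are combined before shuffle;
//     // when unset it defaults to (0.8 * batchRows).toInt.max(1)
//     val minShuffleBatchRows = gluten.veloxMinBatchSizeForShuffle
//   }
// }}}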
def maxBatchSize: Int = conf.getConf(COLUMNAR_MAX_BATCH_SIZE) def shuffleWriterBufferSize: Int = conf @@ -238,6 +237,12 @@ class GlutenConfig(conf: SQLConf) extends Logging { def memoryIsolation: Boolean = conf.getConf(COLUMNAR_MEMORY_ISOLATION) + def numTaskSlotsPerExecutor: Int = { + val numSlots = conf.getConf(NUM_TASK_SLOTS_PER_EXECUTOR) + assert(numSlots > 0, s"Number of task slot not found. This should not happen.") + numSlots + } + def offHeapMemorySize: Long = conf.getConf(COLUMNAR_OFFHEAP_SIZE_IN_BYTES) def taskOffHeapMemorySize: Long = conf.getConf(COLUMNAR_TASK_OFFHEAP_SIZE_IN_BYTES) @@ -268,7 +273,9 @@ class GlutenConfig(conf: SQLConf) extends Logging { def veloxSsdODirectEnabled: Boolean = conf.getConf(COLUMNAR_VELOX_SSD_ODIRECT_ENABLED) - def veloxConnectorIOThreads: Integer = conf.getConf(COLUMNAR_VELOX_CONNECTOR_IO_THREADS) + def veloxConnectorIOThreads: Int = { + conf.getConf(COLUMNAR_VELOX_CONNECTOR_IO_THREADS).getOrElse(numTaskSlotsPerExecutor) + } def veloxSplitPreloadPerDriver: Integer = conf.getConf(COLUMNAR_VELOX_SPLIT_PRELOAD_PER_DRIVER) @@ -293,9 +300,26 @@ class GlutenConfig(conf: SQLConf) extends Logging { def veloxBloomFilterMaxNumBits: Long = conf.getConf(COLUMNAR_VELOX_BLOOM_FILTER_MAX_NUM_BITS) + def veloxCoalesceBatchesBeforeShuffle: Boolean = + conf.getConf(COLUMNAR_VELOX_COALESCE_BATCHES_BEFORE_SHUFFLE) + + def veloxMinBatchSizeForShuffle: Int = { + val defaultSize: Int = (0.8 * conf.getConf(COLUMNAR_MAX_BATCH_SIZE)).toInt.max(1) + conf + .getConf(COLUMNAR_VELOX_MIN_BATCH_SIZE_FOR_SHUFFLE) + .getOrElse(defaultSize) + } + def chColumnarShufflePreferSpill: Boolean = conf.getConf(COLUMNAR_CH_SHUFFLE_PREFER_SPILL_ENABLED) - def chColumnarShuffleSpillThreshold: Long = conf.getConf(COLUMNAR_CH_SHUFFLE_SPILL_THRESHOLD) + def chColumnarShuffleSpillThreshold: Long = { + val threshold = conf.getConf(COLUMNAR_CH_SHUFFLE_SPILL_THRESHOLD) + if (threshold == 0) { + (conf.getConf(COLUMNAR_TASK_OFFHEAP_SIZE_IN_BYTES) * 0.9).toLong + } else { + threshold + } + } def chColumnarThrowIfMemoryExceed: Boolean = conf.getConf(COLUMNAR_CH_THROW_IF_MEMORY_EXCEED) @@ -307,7 +331,11 @@ class GlutenConfig(conf: SQLConf) extends Logging { def chColumnarSpillFirstlyBeforeStop: Boolean = conf.getConf(COLUMNAR_CH_SPILL_FIRSTLY_BEFORE_STOP) - def chColumnarForceSortShuffle: Boolean = conf.getConf(COLUMNAR_CH_FORCE_SORT_SHUFFLE) + def chColumnarForceExternalSortShuffle: Boolean = + conf.getConf(COLUMNAR_CH_FORCE_EXTERNAL_SORT_SHUFFLE) + + def chColumnarForceMemorySortShuffle: Boolean = + conf.getConf(COLUMNAR_CH_FORCE_MEMORY_SORT_SHUFFLE) def cartesianProductTransformerEnabled: Boolean = conf.getConf(CARTESIAN_PRODUCT_TRANSFORMER_ENABLED) @@ -359,8 +387,7 @@ class GlutenConfig(conf: SQLConf) extends Logging { conf.getConf(COLUMNAR_VELOX_MEMORY_USE_HUGE_PAGES) def debug: Boolean = conf.getConf(DEBUG_ENABLED) - def debugKeepJniWorkspace: Boolean = - conf.getConf(DEBUG_ENABLED) && conf.getConf(DEBUG_KEEP_JNI_WORKSPACE) + def debugKeepJniWorkspace: Boolean = conf.getConf(DEBUG_KEEP_JNI_WORKSPACE) def taskStageId: Int = conf.getConf(BENCHMARK_TASK_STAGEID) def taskPartitionId: Int = conf.getConf(BENCHMARK_TASK_PARTITIONID) def taskId: Long = conf.getConf(BENCHMARK_TASK_TASK_ID) @@ -404,10 +431,20 @@ class GlutenConfig(conf: SQLConf) extends Logging { def awsSdkLogLevel: String = conf.getConf(AWS_SDK_LOG_LEVEL) + def awsS3RetryMode: String = conf.getConf(AWS_S3_RETRY_MODE) + + def awsConnectionTimeout: String = conf.getConf(AWS_S3_CONNECT_TIMEOUT) + def enableCastAvgAggregateFunction: Boolean = 
conf.getConf(COLUMNAR_NATIVE_CAST_AGGREGATE_ENABLED) + def enableGlutenCostEvaluator: Boolean = conf.getConf(COST_EVALUATOR_ENABLED) + def dynamicOffHeapSizingEnabled: Boolean = conf.getConf(DYNAMIC_OFFHEAP_SIZING_ENABLED) + + def enableHiveFileFormatWriter: Boolean = conf.getConf(NATIVE_HIVEFILEFORMAT_WRITER_ENABLED) + + def enableCelebornFallback: Boolean = conf.getConf(CELEBORN_FALLBACK_ENABLED) } object GlutenConfig { @@ -452,6 +489,10 @@ object GlutenConfig { val SPARK_S3_IAM: String = HADOOP_PREFIX + S3_IAM_ROLE val S3_IAM_ROLE_SESSION_NAME = "fs.s3a.iam.role.session.name" val SPARK_S3_IAM_SESSION_NAME: String = HADOOP_PREFIX + S3_IAM_ROLE_SESSION_NAME + val S3_RETRY_MAX_ATTEMPTS = "fs.s3a.retry.limit" + val SPARK_S3_RETRY_MAX_ATTEMPTS: String = HADOOP_PREFIX + S3_RETRY_MAX_ATTEMPTS + val S3_CONNECTION_MAXIMUM = "fs.s3a.connection.maximum" + val SPARK_S3_CONNECTION_MAXIMUM: String = HADOOP_PREFIX + S3_CONNECTION_MAXIMUM // Hardware acceleraters backend val GLUTEN_SHUFFLE_CODEC_BACKEND = "spark.gluten.sql.columnar.shuffle.codecBackend" @@ -509,8 +550,10 @@ object GlutenConfig { val GLUTEN_DEBUG_MODE = "spark.gluten.sql.debug" val GLUTEN_DEBUG_KEEP_JNI_WORKSPACE = "spark.gluten.sql.debug.keepJniWorkspace" + val GLUTEN_DEBUG_KEEP_JNI_WORKSPACE_DIR = "spark.gluten.sql.debug.keepJniWorkspaceDir" // Added back to Spark Conf during executor initialization + val GLUTEN_NUM_TASK_SLOTS_PER_EXECUTOR_KEY = "spark.gluten.numTaskSlotsPerExecutor" val GLUTEN_OFFHEAP_SIZE_IN_BYTES_KEY = "spark.gluten.memory.offHeap.size.in.bytes" val GLUTEN_TASK_OFFHEAP_SIZE_IN_BYTES_KEY = "spark.gluten.memory.task.offHeap.size.in.bytes" val GLUTEN_CONSERVATIVE_TASK_OFFHEAP_SIZE_IN_BYTES_KEY = @@ -561,6 +604,8 @@ object GlutenConfig { val GLUTEN_DYNAMIC_OFFHEAP_SIZING_MEMORY_FRACTION = "spark.gluten.memory.dynamic.offHeap.sizing.memory.fraction" + val GLUTEN_COST_EVALUATOR_ENABLED = "spark.gluten.sql.adaptive.costEvaluator.enabled" + var ins: GlutenConfig = _ def getConf: GlutenConfig = { @@ -603,6 +648,10 @@ object GlutenConfig { SPARK_S3_USE_INSTANCE_CREDENTIALS, SPARK_S3_IAM, SPARK_S3_IAM_SESSION_NAME, + SPARK_S3_RETRY_MAX_ATTEMPTS, + SPARK_S3_CONNECTION_MAXIMUM, + AWS_S3_CONNECT_TIMEOUT.key, + AWS_S3_RETRY_MODE.key, AWS_SDK_LOG_LEVEL.key, // gcs config SPARK_GCS_STORAGE_ROOT_URL, @@ -654,9 +703,13 @@ object GlutenConfig { (SPARK_S3_USE_INSTANCE_CREDENTIALS, "false"), (SPARK_S3_IAM, ""), (SPARK_S3_IAM_SESSION_NAME, ""), + (SPARK_S3_RETRY_MAX_ATTEMPTS, "20"), + (SPARK_S3_CONNECTION_MAXIMUM, "15"), + (AWS_S3_CONNECT_TIMEOUT.key, AWS_S3_CONNECT_TIMEOUT.defaultValueString), + (AWS_S3_RETRY_MODE.key, AWS_S3_RETRY_MODE.defaultValueString), ( COLUMNAR_VELOX_CONNECTOR_IO_THREADS.key, - COLUMNAR_VELOX_CONNECTOR_IO_THREADS.defaultValueString), + conf.getOrElse(GLUTEN_NUM_TASK_SLOTS_PER_EXECUTOR_KEY, "-1")), (COLUMNAR_SHUFFLE_CODEC.key, ""), (COLUMNAR_SHUFFLE_CODEC_BACKEND.key, ""), ("spark.hadoop.input.connect.timeout", "180000"), @@ -679,7 +732,9 @@ object GlutenConfig { GLUTEN_OFFHEAP_SIZE_IN_BYTES_KEY, GLUTEN_TASK_OFFHEAP_SIZE_IN_BYTES_KEY, - GLUTEN_OFFHEAP_ENABLED + GLUTEN_OFFHEAP_ENABLED, + SESSION_LOCAL_TIMEZONE.key, + DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key ) nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava) @@ -949,13 +1004,6 @@ object GlutenConfig { .intConf .createWithDefault(12) - val COLUMNAR_LOGICAL_JOIN_OPTIMIZATION_ENABLED = - buildConf("spark.gluten.sql.columnar.logicalJoinOptimizeEnable") - .internal() - .doc("Enable or disable columnar logicalJoinOptimize.") - .booleanConf - 
.createWithDefault(false) - val COLUMNAR_SCAN_ONLY_ENABLED = buildConf("spark.gluten.sql.columnar.scanOnly") .internal() @@ -1139,6 +1187,16 @@ object GlutenConfig { .stringConf .createOptional + val NUM_TASK_SLOTS_PER_EXECUTOR = + buildConf(GlutenConfig.GLUTEN_NUM_TASK_SLOTS_PER_EXECUTOR_KEY) + .internal() + .doc( + "Must provide default value since non-execution operations " + + "(e.g. org.apache.spark.sql.Dataset#summary) doesn't propagate configurations using " + + "org.apache.spark.sql.execution.SQLExecution#withSQLConfPropagated") + .intConf + .createWithDefaultString("-1") + val COLUMNAR_OFFHEAP_SIZE_IN_BYTES = buildConf(GlutenConfig.GLUTEN_OFFHEAP_SIZE_IN_BYTES_KEY) .internal() @@ -1209,7 +1267,7 @@ object GlutenConfig { val RAS_COST_MODEL = buildConf("spark.gluten.ras.costModel") .doc( - "Experimental: The classpath of user-defined cost model that will be used by RAS. " + + "Experimental: The class name of user-defined cost model that will be used by RAS. " + "If not specified, a rough built-in cost model will be used.") .stringConf .createWithDefaultString("rough") @@ -1277,11 +1335,7 @@ object GlutenConfig { .doc("The Size of the IO thread pool in the Connector. This thread pool is used for split" + " preloading and DirectBufferedInput.") .intConf - .createWithDefaultFunction( - () => - SQLConf.get.getConfString("spark.executor.cores", "1").toInt / SQLConf.get - .getConfString("spark.task.cpus", "1") - .toInt) + .createOptional val COLUMNAR_VELOX_ASYNC_TIMEOUT = buildStaticConf("spark.gluten.sql.columnar.backend.velox.asyncTimeoutOnTaskStopping") @@ -1378,6 +1432,23 @@ object GlutenConfig { .checkValue(_ > 0, "must be a positive number") .createWithDefault(10000) + val COLUMNAR_VELOX_COALESCE_BATCHES_BEFORE_SHUFFLE = + buildConf("spark.gluten.sql.columnar.backend.velox.coalesceBatchesBeforeShuffle") + .internal() + .doc(s"If true, combine small columnar batches together before sending to shuffle. " + + s"The default minimum output batch size is equal to 0.8 * $GLUTEN_MAX_BATCH_SIZE_KEY") + .booleanConf + .createWithDefault(true) + + val COLUMNAR_VELOX_MIN_BATCH_SIZE_FOR_SHUFFLE = + buildConf("spark.gluten.sql.columnar.backend.velox.minBatchSizeForShuffle") + .internal() + .doc(s"The minimum batch size for shuffle. If the batch size is smaller than this value, " + + s"it will be combined with other batches before sending to shuffle. Only functions when " + + s"${COLUMNAR_VELOX_COALESCE_BATCHES_BEFORE_SHUFFLE.key} is set to true.") + .intConf + .createOptional + val COLUMNAR_CH_SHUFFLE_PREFER_SPILL_ENABLED = buildConf("spark.gluten.sql.columnar.backend.ch.shuffle.preferSpill") .internal() @@ -1414,7 +1485,7 @@ object GlutenConfig { .internal() .doc("The maximum size of sort shuffle buffer in CH backend.") .bytesConf(ByteUnit.BYTE) - .createWithDefaultString("1GB") + .createWithDefaultString("0") val COLUMNAR_CH_SPILL_FIRSTLY_BEFORE_STOP = buildConf("spark.gluten.sql.columnar.backend.ch.spillFirstlyBeforeStop") @@ -1423,11 +1494,17 @@ object GlutenConfig { .booleanConf .createWithDefault(true) - val COLUMNAR_CH_FORCE_SORT_SHUFFLE = - buildConf("spark.gluten.sql.columnar.backend.ch.forceSortShuffle") + val COLUMNAR_CH_FORCE_EXTERNAL_SORT_SHUFFLE = + buildConf("spark.gluten.sql.columnar.backend.ch.forceExternalSortShuffle") .internal() - .doc("Whether to force to use sort shuffle in CH backend. " + - "Sort shuffle will enable When partition num greater than 300.") + .doc("Whether to force to use external sort shuffle in CH backend. 
") + .booleanConf + .createWithDefault(false) + + val COLUMNAR_CH_FORCE_MEMORY_SORT_SHUFFLE = + buildConf("spark.gluten.sql.columnar.backend.ch.forceMemorySortShuffle") + .internal() + .doc("Whether to force to use memory sort shuffle in CH backend. ") .booleanConf .createWithDefault(false) @@ -1484,11 +1561,17 @@ object GlutenConfig { .createWithDefault(false) val DEBUG_KEEP_JNI_WORKSPACE = - buildConf(GLUTEN_DEBUG_KEEP_JNI_WORKSPACE) + buildStaticConf(GLUTEN_DEBUG_KEEP_JNI_WORKSPACE) .internal() .booleanConf .createWithDefault(false) + val DEBUG_KEEP_JNI_WORKSPACE_DIR = + buildStaticConf(GLUTEN_DEBUG_KEEP_JNI_WORKSPACE_DIR) + .internal() + .stringConf + .createWithDefault("/tmp") + val BENCHMARK_TASK_STAGEID = buildConf("spark.gluten.sql.benchmark_task.stageId") .internal() @@ -1514,6 +1597,16 @@ object GlutenConfig { .booleanConf .createOptional + val NATIVE_HIVEFILEFORMAT_WRITER_ENABLED = + buildConf("spark.gluten.sql.native.hive.writer.enabled") + .internal() + .doc( + "This is config to specify whether to enable the native columnar writer for " + + "HiveFileFormat. Currently only supports HiveFileFormat with Parquet as the output " + + "file type.") + .booleanConf + .createWithDefault(true) + val NATIVE_ARROW_READER_ENABLED = buildConf("spark.gluten.sql.native.arrow.reader.enabled") .internal() @@ -1772,6 +1865,13 @@ object GlutenConfig { .booleanConf .createWithDefault(true) + val COLUMNAR_SAMPLE_ENABLED = + buildConf("spark.gluten.sql.columnarSampleEnabled") + .internal() + .doc("Disable or enable columnar sample.") + .booleanConf + .createWithDefault(false) + val CACHE_WHOLE_STAGE_TRANSFORMER_CONTEXT = buildConf("spark.gluten.sql.cacheWholeStageTransformerContext") .internal() @@ -1844,6 +1944,20 @@ object GlutenConfig { .stringConf .createWithDefault("FATAL") + val AWS_S3_RETRY_MODE = + buildConf("spark.gluten.velox.fs.s3a.retry.mode") + .internal() + .doc("Retry mode for AWS s3 connection error: legacy, standard and adaptive.") + .stringConf + .createWithDefault("legacy") + + val AWS_S3_CONNECT_TIMEOUT = + buildConf("spark.gluten.velox.fs.s3a.connect.timeout") + .internal() + .doc("Timeout for AWS s3 connection.") + .stringConf + .createWithDefault("200s") + val VELOX_ORC_SCAN_ENABLED = buildStaticConf("spark.gluten.sql.columnar.backend.velox.orc.scan.enabled") .internal() @@ -1878,6 +1992,17 @@ object GlutenConfig { .booleanConf .createWithDefault(true) + val COST_EVALUATOR_ENABLED = + buildConf(GlutenConfig.GLUTEN_COST_EVALUATOR_ENABLED) + .internal() + .doc( + "If true and gluten enabled, use " + + "org.apache.spark.sql.execution.adaptive.GlutenCostEvaluator as custom cost " + + "evaluator class, else follow the configuration " + + "spark.sql.adaptive.customCostEvaluatorClass.") + .booleanConf + .createWithDefault(true) + val DYNAMIC_OFFHEAP_SIZING_ENABLED = buildConf(GlutenConfig.GLUTEN_DYNAMIC_OFFHEAP_SIZING_ENABLED) .internal() @@ -1905,4 +2030,12 @@ object GlutenConfig { .doubleConf .checkValue(v => v >= 0 && v <= 1, "offheap sizing memory fraction must between [0, 1]") .createWithDefault(0.6) + + val CELEBORN_FALLBACK_ENABLED = + buildStaticConf("spark.gluten.sql.columnar.shuffle.celeborn.fallback.enabled") + .internal() + .doc("If enabled, fall back to ColumnarShuffleManager when celeborn service is unavailable." 
+ + "Otherwise, throw an exception.") + .booleanConf + .createWithDefault(true) } diff --git a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala index 8214ff3893e5c..408c8e0e9e81e 100644 --- a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala +++ b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala @@ -83,7 +83,10 @@ object ExpressionNames { final val IS_NAN = "isnan" final val NANVL = "nanvl" final val TRY_EVAL = "try" - final val TRY_ADD = "try_add" + final val CHECKED_ADD = "checked_add" + final val CHECKED_SUBTRACT = "checked_subtract" + final val CHECKED_DIVIDE = "checked_divide" + final val CHECKED_MULTIPLY = "checked_multiply" // SparkSQL String functions final val ASCII = "ascii" @@ -113,6 +116,7 @@ object ExpressionNames { final val LEFT = "left" final val REPEAT = "repeat" final val TRANSLATE = "translate" + final val SOUNDEX = "soundex" final val SPACE = "space" final val EMPTY2NULL = "empty2null" final val INITCAP = "initcap" @@ -123,6 +127,10 @@ object ExpressionNames { final val ENCODE = "encode" final val UUID = "uuid" final val BIT_LENGTH = "bit_length" + final val OCTET_LENGTH = "octet_length" + final val LEVENSHTEIN = "levenshteinDistance" + final val UNBASE64 = "unbase64" + final val BASE64 = "base64" // URL functions final val PARSE_URL = "parse_url" @@ -167,8 +175,8 @@ object ExpressionNames { final val SIGN = "sign" final val LOG1P = "log1p" final val LOG2 = "log2" - final val LOG = "log" - final val LOGARITHM = "logarithm" + final val LOG = "ln" + final val LOGARITHM = "log" final val RADIANS = "radians" final val GREATEST = "greatest" final val LEAST = "least" @@ -215,6 +223,7 @@ object ExpressionNames { final val UNIX_MICROS = "unix_micros" final val TIMESTAMP_MILLIS = "timestamp_millis" final val TIMESTAMP_MICROS = "timestamp_micros" + final val PRECYSE_TIMESTAMP_CONVERSION = "precise_timestamp_conversion" // JSON functions final val GET_JSON_OBJECT = "get_json_object" @@ -269,6 +278,8 @@ object ExpressionNames { final val MAP_FROM_ARRAYS = "map_from_arrays" final val MAP_ENTRIES = "map_entries" final val MAP_ZIP_WITH = "map_zip_with" + final val TRANSFORM_KEYS = "transform_keys" + final val TRANSFORM_VALUES = "transform_values" final val STR_TO_MAP = "str_to_map" // struct functions @@ -304,6 +315,7 @@ object ExpressionNames { final val MONOTONICALLY_INCREASING_ID = "monotonically_increasing_id" final val WIDTH_BUCKET = "width_bucket" final val ASSERT_TRUE = "assert_true" + final val REPLICATE_ROWS = "replicaterows" // Directly use child expression transformer final val KNOWN_NULLABLE = "known_nullable" diff --git a/shims/common/src/main/scala/org/apache/gluten/sql/shims/SparkShims.scala b/shims/common/src/main/scala/org/apache/gluten/sql/shims/SparkShims.scala index d6acc8c27b291..bd1665330a891 100644 --- a/shims/common/src/main/scala/org/apache/gluten/sql/shims/SparkShims.scala +++ b/shims/common/src/main/scala/org/apache/gluten/sql/shims/SparkShims.scala @@ -28,27 +28,27 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.csv.CSVOptions import org.apache.spark.sql.catalyst.expressions.{Attribute, BinaryExpression, Expression} -import org.apache.spark.sql.catalyst.expressions.aggregate.{ImperativeAggregate, TypedImperativeAggregate} +import 
org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.catalyst.plans.physical.Distribution -import org.apache.spark.sql.catalyst.plans.physical.Partitioning +import org.apache.spark.sql.catalyst.plans.physical.{Distribution, Partitioning} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.catalyst.trees.TernaryLike import org.apache.spark.sql.connector.catalog.Table import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.connector.read.{InputPartition, Scan} -import org.apache.spark.sql.execution.{FileSourceScanExec, FilterExec, GlobalLimitExec, SparkPlan, TakeOrderedAndProjectExec} -import org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec +import org.apache.spark.sql.execution.{FileSourceScanExec, GlobalLimitExec, SparkPlan, TakeOrderedAndProjectExec} import org.apache.spark.sql.execution.command.DataWritingCommandExec -import org.apache.spark.sql.execution.datasources.{FilePartition, FileScanRDD, PartitionDirectory, PartitionedFile, PartitioningAwareFileIndex, WriteJobDescription, WriteTaskResult} +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetFilters} import org.apache.spark.sql.execution.datasources.v2.BatchScanExec import org.apache.spark.sql.execution.datasources.v2.text.TextScan import org.apache.spark.sql.execution.exchange.{BroadcastExchangeLike, ShuffleExchangeLike} -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.storage.{BlockId, BlockManagerId} import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.parquet.schema.MessageType import java.util.{ArrayList => JArrayList, Map => JMap} @@ -193,8 +193,12 @@ trait SparkShims { def getFileStatus(partition: PartitionDirectory): Seq[FileStatus] + def isFileSplittable(relation: HadoopFsRelation, filePath: Path, sparkSchema: StructType): Boolean + def isRowIndexMetadataColumn(name: String): Boolean + def findRowIndexColumnIndexInSchema(sparkSchema: StructType): Int + def splitFiles( sparkSession: SparkSession, file: FileStatus, @@ -207,6 +211,9 @@ trait SparkShims { def attributesFromStruct(structType: StructType): Seq[Attribute] + // Spark 3.3 and later only have file size and modification time in PartitionedFile + def getFileSizeAndModificationTime(file: PartitionedFile): (Option[Long], Option[Long]) + def generateMetadataColumns( file: PartitionedFile, metadataColumnNames: Seq[String] = Seq.empty): JMap[String, String] @@ -236,4 +243,9 @@ trait SparkShims { def dateTimestampFormatInReadIsDefaultValue(csvOptions: CSVOptions, timeZone: String): Boolean def isPlannedV1Write(write: DataWritingCommandExec): Boolean = false + + def createParquetFilters( + conf: SQLConf, + schema: MessageType, + caseSensitive: Option[Boolean] = None): ParquetFilters } diff --git a/shims/common/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCost.scala b/shims/common/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCost.scala new file mode 100644 index 0000000000000..9436f469ba79e --- /dev/null +++ b/shims/common/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCost.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or 
more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.adaptive + +import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.execution.SparkPlan + +class GlutenCost(val eval: CostEvaluator, val plan: SparkPlan) extends Cost { + override def compare(that: Cost): Int = that match { + case that: GlutenCost if plan eq that.plan => + 0 + case that: GlutenCost if plan == that.plan => + // Plans are identical. Considers the newer one as having lower cost. + -(plan.id - that.plan.id) + case that: GlutenCost => + // Plans are different. Use the delegated cost evaluator. + assert(eval == that.eval) + eval.evaluateCost(plan).compare(eval.evaluateCost(that.plan)) + case _ => + throw QueryExecutionErrors.cannotCompareCostWithTargetCostError(that.toString) + } + + override def hashCode(): Int = throw new UnsupportedOperationException() + + override def equals(obj: Any): Boolean = obj match { + case that: Cost => compare(that) == 0 + case _ => false + } +} diff --git a/shims/pom.xml b/shims/pom.xml index 5c17c3ec32ccb..61bea7040999e 100644 --- a/shims/pom.xml +++ b/shims/pom.xml @@ -37,7 +37,7 @@ org.scalatest scalatest_${scala.binary.version} - 3.2.3 + 3.2.16 test diff --git a/shims/spark32/pom.xml b/shims/spark32/pom.xml index 4ee8889cf9a93..2a3ed7400c9cb 100644 --- a/shims/spark32/pom.xml +++ b/shims/spark32/pom.xml @@ -108,6 +108,7 @@ -Wconf:cat=deprecation:silent + -P:wartremover:traverser:io.github.zhztheplayer.scalawarts.InheritFromCaseClass diff --git a/shims/spark32/src/main/java/org/apache/spark/sql/execution/joins/HashJoin.scala.deprecated b/shims/spark32/src/main/java/org/apache/spark/sql/execution/joins/HashJoin.scala.deprecated deleted file mode 100644 index 24bf246b325cc..0000000000000 --- a/shims/spark32/src/main/java/org/apache/spark/sql/execution/joins/HashJoin.scala.deprecated +++ /dev/null @@ -1,774 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.execution.joins - -import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} -import org.apache.spark.sql.catalyst.analysis.CastSupport -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences -import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide} -import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.physical.Partitioning -import org.apache.spark.sql.execution.{CodegenSupport, ExplainUtils, RowIterator} -import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.types.{BooleanType, IntegralType, LongType} - -/** - * @param relationTerm variable name for HashedRelation - * @param keyIsUnique indicate whether keys of HashedRelation known to be unique in code-gen time - * @param isEmpty indicate whether it known to be EmptyHashedRelation in code-gen time - */ -private[joins] case class HashedRelationInfo( - relationTerm: String, - keyIsUnique: Boolean, - isEmpty: Boolean) - -trait HashJoin extends JoinCodegenSupport { - def buildSide: BuildSide - - override def simpleStringWithNodeId(): String = { - val opId = ExplainUtils.getOpId(this) - s"$nodeName $joinType ${buildSide} ($opId)".trim - } - - override def output: Seq[Attribute] = { - joinType match { - case _: InnerLike => - left.output ++ right.output - case LeftOuter => - left.output ++ right.output.map(_.withNullability(true)) - case RightOuter => - left.output.map(_.withNullability(true)) ++ right.output - case j: ExistenceJoin => - left.output :+ j.exists - case LeftExistence(_) => - left.output - case x => - throw new IllegalArgumentException(s"HashJoin should not take $x as the JoinType") - } - } - - override def outputPartitioning: Partitioning = buildSide match { - case BuildLeft => - joinType match { - case _: InnerLike | RightOuter => right.outputPartitioning - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building left side") - } - case BuildRight => - joinType match { - case _: InnerLike | LeftOuter | LeftSemi | LeftAnti | _: ExistenceJoin => - left.outputPartitioning - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building right side") - } - } - - /** - * Handle the special cases for LeftOuter/LeftSemi with BuildLeft and RightOuter with BuildRight. 
- */ - override def outputOrdering: Seq[SortOrder] = buildSide match { - case BuildLeft => - joinType match { - case _: InnerLike | RightOuter => right.outputOrdering - case LeftOuter => left.outputOrdering - case LeftSemi => left.outputOrdering - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building left side") - } - case BuildRight => - joinType match { - case _: InnerLike | LeftOuter | LeftSemi | LeftAnti | _: ExistenceJoin => - left.outputOrdering - case RightOuter => right.outputOrdering - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building right side") - } - } - - protected lazy val (buildPlan, streamedPlan) = buildSide match { - case BuildLeft => (left, right) - case BuildRight => (right, left) - } - - protected lazy val (buildKeys, streamedKeys) = { - require(leftKeys.length == rightKeys.length && - leftKeys.map(_.dataType) - .zip(rightKeys.map(_.dataType)) - .forall(types => types._1.sameType(types._2)), - "Join keys from two sides should have same length and types") - buildSide match { - case BuildLeft => (leftKeys, rightKeys) - case BuildRight => (rightKeys, leftKeys) - } - } - - @transient protected lazy val (buildOutput, streamedOutput) = { - buildSide match { - case BuildLeft => (left.output, right.output) - case BuildRight => (right.output, left.output) - } - } - - @transient protected lazy val buildBoundKeys = - bindReferences(HashJoin.rewriteKeyExpr(buildKeys), buildOutput) - - @transient protected lazy val streamedBoundKeys = - bindReferences(HashJoin.rewriteKeyExpr(streamedKeys), streamedOutput) - - protected def buildSideKeyGenerator(): Projection = - UnsafeProjection.create(buildBoundKeys) - - protected def streamSideKeyGenerator(): UnsafeProjection = - UnsafeProjection.create(streamedBoundKeys) - - @transient protected[this] lazy val boundCondition = if (condition.isDefined) { - if (joinType == FullOuter && buildSide == BuildLeft) { - // Put join left side before right side. This is to be consistent with - // `ShuffledHashJoinExec.fullOuterJoin`. 
- Predicate.create(condition.get, buildPlan.output ++ streamedPlan.output).eval _ - } else { - Predicate.create(condition.get, streamedPlan.output ++ buildPlan.output).eval _ - } - } else { - (r: InternalRow) => true - } - - protected def createResultProjection(): (InternalRow) => InternalRow = joinType match { - case LeftExistence(_) => - UnsafeProjection.create(output, output) - case _ => - // Always put the stream side on left to simplify implementation - // both of left and right side could be null - UnsafeProjection.create( - output, (streamedPlan.output ++ buildPlan.output).map(_.withNullability(true))) - } - - private def innerJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinRow = new JoinedRow - val joinKeys = streamSideKeyGenerator() - - if (hashedRelation == EmptyHashedRelation) { - Iterator.empty - } else if (hashedRelation.keyIsUnique) { - streamIter.flatMap { srow => - joinRow.withLeft(srow) - val matched = hashedRelation.getValue(joinKeys(srow)) - if (matched != null) { - Some(joinRow.withRight(matched)).filter(boundCondition) - } else { - None - } - } - } else { - streamIter.flatMap { srow => - joinRow.withLeft(srow) - val matches = hashedRelation.get(joinKeys(srow)) - if (matches != null) { - matches.map(joinRow.withRight).filter(boundCondition) - } else { - Seq.empty - } - } - } - } - - private def outerJoin( - streamedIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinedRow = new JoinedRow() - val keyGenerator = streamSideKeyGenerator() - val nullRow = new GenericInternalRow(buildPlan.output.length) - - if (hashedRelation.keyIsUnique) { - streamedIter.map { currentRow => - val rowKey = keyGenerator(currentRow) - joinedRow.withLeft(currentRow) - val matched = hashedRelation.getValue(rowKey) - if (matched != null && boundCondition(joinedRow.withRight(matched))) { - joinedRow - } else { - joinedRow.withRight(nullRow) - } - } - } else { - streamedIter.flatMap { currentRow => - val rowKey = keyGenerator(currentRow) - joinedRow.withLeft(currentRow) - val buildIter = hashedRelation.get(rowKey) - new RowIterator { - private var found = false - override def advanceNext(): Boolean = { - while (buildIter != null && buildIter.hasNext) { - val nextBuildRow = buildIter.next() - if (boundCondition(joinedRow.withRight(nextBuildRow))) { - found = true - return true - } - } - if (!found) { - joinedRow.withRight(nullRow) - found = true - return true - } - false - } - override def getRow: InternalRow = joinedRow - }.toScala - } - } - } - - private def semiJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinKeys = streamSideKeyGenerator() - val joinedRow = new JoinedRow - - if (hashedRelation == EmptyHashedRelation) { - Iterator.empty - } else if (hashedRelation.keyIsUnique) { - streamIter.filter { current => - val key = joinKeys(current) - lazy val matched = hashedRelation.getValue(key) - !key.anyNull && matched != null && - (condition.isEmpty || boundCondition(joinedRow(current, matched))) - } - } else { - streamIter.filter { current => - val key = joinKeys(current) - lazy val buildIter = hashedRelation.get(key) - !key.anyNull && buildIter != null && (condition.isEmpty || buildIter.exists { - (row: InternalRow) => boundCondition(joinedRow(current, row)) - }) - } - } - } - - private def existenceJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinKeys = 
streamSideKeyGenerator() - val result = new GenericInternalRow(Array[Any](null)) - val joinedRow = new JoinedRow - - if (hashedRelation.keyIsUnique) { - streamIter.map { current => - val key = joinKeys(current) - lazy val matched = hashedRelation.getValue(key) - val exists = !key.anyNull && matched != null && - (condition.isEmpty || boundCondition(joinedRow(current, matched))) - result.setBoolean(0, exists) - joinedRow(current, result) - } - } else { - streamIter.map { current => - val key = joinKeys(current) - lazy val buildIter = hashedRelation.get(key) - val exists = !key.anyNull && buildIter != null && (condition.isEmpty || buildIter.exists { - (row: InternalRow) => boundCondition(joinedRow(current, row)) - }) - result.setBoolean(0, exists) - joinedRow(current, result) - } - } - } - - private def antiJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - // If the right side is empty, AntiJoin simply returns the left side. - if (hashedRelation == EmptyHashedRelation) { - return streamIter - } - - val joinKeys = streamSideKeyGenerator() - val joinedRow = new JoinedRow - - if (hashedRelation.keyIsUnique) { - streamIter.filter { current => - val key = joinKeys(current) - lazy val matched = hashedRelation.getValue(key) - key.anyNull || matched == null || - (condition.isDefined && !boundCondition(joinedRow(current, matched))) - } - } else { - streamIter.filter { current => - val key = joinKeys(current) - lazy val buildIter = hashedRelation.get(key) - key.anyNull || buildIter == null || (condition.isDefined && !buildIter.exists { - row => boundCondition(joinedRow(current, row)) - }) - } - } - } - - protected def join( - streamedIter: Iterator[InternalRow], - hashed: HashedRelation, - numOutputRows: SQLMetric): Iterator[InternalRow] = { - - val joinedIter = joinType match { - case _: InnerLike => - innerJoin(streamedIter, hashed) - case LeftOuter | RightOuter => - outerJoin(streamedIter, hashed) - case LeftSemi => - semiJoin(streamedIter, hashed) - case LeftAnti => - antiJoin(streamedIter, hashed) - case _: ExistenceJoin => - existenceJoin(streamedIter, hashed) - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType") - } - - val resultProj = createResultProjection - joinedIter.map { r => - numOutputRows += 1 - resultProj(r) - } - } - - override def doProduce(ctx: CodegenContext): String = { - streamedPlan.asInstanceOf[CodegenSupport].produce(ctx, this) - } - - override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { - joinType match { - case _: InnerLike => codegenInner(ctx, input) - case LeftOuter | RightOuter => codegenOuter(ctx, input) - case LeftSemi => codegenSemi(ctx, input) - case LeftAnti => codegenAnti(ctx, input) - case _: ExistenceJoin => codegenExistence(ctx, input) - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType") - } - } - - /** - * Returns the code for generating join key for stream side, and expression of whether the key - * has any null in it or not. 
- */ - protected def genStreamSideJoinKey( - ctx: CodegenContext, - input: Seq[ExprCode]): (ExprCode, String) = { - ctx.currentVars = input - if (streamedBoundKeys.length == 1 && streamedBoundKeys.head.dataType == LongType) { - // generate the join key as Long - val ev = streamedBoundKeys.head.genCode(ctx) - (ev, ev.isNull) - } else { - // generate the join key as UnsafeRow - val ev = GenerateUnsafeProjection.createCode(ctx, streamedBoundKeys) - (ev, s"${ev.value}.anyNull()") - } - } - - /** - * Generates the code for Inner join. - */ - protected def codegenInner(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, isEmptyHashedRelation) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val (matched, checkCondition, buildVars) = getJoinCondition(ctx, input, streamedPlan, buildPlan) - val numOutput = metricTerm(ctx, "numOutputRows") - - val resultVars = buildSide match { - case BuildLeft => buildVars ++ input - case BuildRight => input ++ buildVars - } - - if (isEmptyHashedRelation) { - """ - |// If HashedRelation is empty, hash inner join simply returns nothing. - """.stripMargin - } else if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |if ($matched != null) { - | $checkCondition { - | $numOutput.add(1); - | ${consume(ctx, resultVars)} - | } - |} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? - | null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |if ($matches != null) { - | while ($matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition { - | $numOutput.add(1); - | ${consume(ctx, resultVars)} - | } - | } - |} - """.stripMargin - } - } - - /** - * Generates the code for left or right outer join. - */ - protected def codegenOuter(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, _) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val matched = ctx.freshName("matched") - val buildVars = genBuildSideVars(ctx, matched, buildPlan) - val numOutput = metricTerm(ctx, "numOutputRows") - - // filter the output via condition - val conditionPassed = ctx.freshName("conditionPassed") - val checkCondition = if (condition.isDefined) { - val expr = condition.get - // evaluate the variables from build side that used by condition - val eval = evaluateRequiredVariables(buildPlan.output, buildVars, expr.references) - ctx.currentVars = input ++ buildVars - val ev = - BindReferences.bindReference(expr, streamedPlan.output ++ buildPlan.output).genCode(ctx) - s""" - |boolean $conditionPassed = true; - |${eval.trim} - |if ($matched != null) { - | ${ev.code} - | $conditionPassed = !${ev.isNull} && ${ev.value}; - |} - """.stripMargin - } else { - s"final boolean $conditionPassed = true;" - } - - val resultVars = buildSide match { - case BuildLeft => buildVars ++ input - case BuildRight => input ++ buildVars - } - - if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? 
null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |${checkCondition.trim} - |if (!$conditionPassed) { - | $matched = null; - | // reset the variables those are already evaluated. - | ${buildVars.filter(_.code.isEmpty).map(v => s"${v.isNull} = true;").mkString("\n")} - |} - |$numOutput.add(1); - |${consume(ctx, resultVars)} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - val found = ctx.freshName("found") - - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |boolean $found = false; - |// the last iteration of this loop is to emit an empty row if there is no matched rows. - |while ($matches != null && $matches.hasNext() || !$found) { - | UnsafeRow $matched = $matches != null && $matches.hasNext() ? - | (UnsafeRow) $matches.next() : null; - | ${checkCondition.trim} - | if ($conditionPassed) { - | $found = true; - | $numOutput.add(1); - | ${consume(ctx, resultVars)} - | } - |} - """.stripMargin - } - } - - /** - * Generates the code for left semi join. - */ - protected def codegenSemi(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, isEmptyHashedRelation) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val (matched, checkCondition, _) = getJoinCondition(ctx, input, streamedPlan, buildPlan) - val numOutput = metricTerm(ctx, "numOutputRows") - - if (isEmptyHashedRelation) { - """ - |// If HashedRelation is empty, hash semi join simply returns nothing. - """.stripMargin - } else if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |if ($matched != null) { - | $checkCondition { - | $numOutput.add(1); - | ${consume(ctx, input)} - | } - |} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - val found = ctx.freshName("found") - - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |if ($matches != null) { - | boolean $found = false; - | while (!$found && $matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition { - | $found = true; - | } - | } - | if ($found) { - | $numOutput.add(1); - | ${consume(ctx, input)} - | } - |} - """.stripMargin - } - } - - /** - * Generates the code for anti join. - */ - protected def codegenAnti(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, isEmptyHashedRelation) = prepareRelation(ctx) - val numOutput = metricTerm(ctx, "numOutputRows") - if (isEmptyHashedRelation) { - return s""" - |// If HashedRelation is empty, hash anti join simply returns the stream side. - |$numOutput.add(1); - |${consume(ctx, input)} - """.stripMargin - } - - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val (matched, checkCondition, _) = getJoinCondition(ctx, input, streamedPlan, buildPlan) - - if (keyIsUnique) { - val found = ctx.freshName("found") - s""" - |boolean $found = false; - |// generate join key for stream side - |${keyEv.code} - |// Check if the key has nulls. 
- |if (!($anyNull)) { - | // Check if the HashedRelation exists. - | UnsafeRow $matched = (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - | if ($matched != null) { - | // Evaluate the condition. - | $checkCondition { - | $found = true; - | } - | } - |} - |if (!$found) { - | $numOutput.add(1); - | ${consume(ctx, input)} - |} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - val found = ctx.freshName("found") - s""" - |boolean $found = false; - |// generate join key for stream side - |${keyEv.code} - |// Check if the key has nulls. - |if (!($anyNull)) { - | // Check if the HashedRelation exists. - | $iteratorCls $matches = ($iteratorCls)$relationTerm.get(${keyEv.value}); - | if ($matches != null) { - | // Evaluate the condition. - | while (!$found && $matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition { - | $found = true; - | } - | } - | } - |} - |if (!$found) { - | $numOutput.add(1); - | ${consume(ctx, input)} - |} - """.stripMargin - } - } - - /** - * Generates the code for existence join. - */ - protected def codegenExistence(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, _) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val numOutput = metricTerm(ctx, "numOutputRows") - val existsVar = ctx.freshName("exists") - - val matched = ctx.freshName("matched") - val buildVars = genBuildSideVars(ctx, matched, buildPlan) - val checkCondition = if (condition.isDefined) { - val expr = condition.get - // evaluate the variables from build side that used by condition - val eval = evaluateRequiredVariables(buildPlan.output, buildVars, expr.references) - // filter the output via condition - ctx.currentVars = input ++ buildVars - val ev = - BindReferences.bindReference(expr, streamedPlan.output ++ buildPlan.output).genCode(ctx) - s""" - |$eval - |${ev.code} - |$existsVar = !${ev.isNull} && ${ev.value}; - """.stripMargin - } else { - s"$existsVar = true;" - } - - val resultVar = input ++ Seq(ExprCode.forNonNullValue( - JavaCode.variable(existsVar, BooleanType))) - - if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |boolean $existsVar = false; - |if ($matched != null) { - | $checkCondition - |} - |$numOutput.add(1); - |${consume(ctx, resultVar)} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? 
null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |boolean $existsVar = false; - |if ($matches != null) { - | while (!$existsVar && $matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition - | } - |} - |$numOutput.add(1); - |${consume(ctx, resultVar)} - """.stripMargin - } - } - - protected def prepareRelation(ctx: CodegenContext): HashedRelationInfo -} - -object HashJoin extends CastSupport with SQLConfHelper { - - private def canRewriteAsLongType(keys: Seq[Expression]): Boolean = { - // TODO: support BooleanType, DateType and TimestampType - keys.forall(_.dataType.isInstanceOf[IntegralType]) && - keys.map(_.dataType.defaultSize).sum <= 8 - } - - /** - * Try to rewrite the key as LongType so we can use getLong(), if they key can fit with a long. - * - * If not, returns the original expressions. - */ - def rewriteKeyExpr(keys: Seq[Expression]): Seq[Expression] = { - assert(keys.nonEmpty) - if (!canRewriteAsLongType(keys)) { - return keys - } - - var keyExpr: Expression = if (keys.head.dataType != LongType) { - cast(keys.head, LongType) - } else { - keys.head - } - keys.tail.foreach { e => - val bits = e.dataType.defaultSize * 8 - keyExpr = BitwiseOr(ShiftLeft(keyExpr, Literal(bits)), - BitwiseAnd(cast(e, LongType), Literal((1L << bits) - 1))) - } - keyExpr :: Nil - } - - /** - * Extract a given key which was previously packed in a long value using its index to - * determine the number of bits to shift - */ - def extractKeyExprAt(keys: Seq[Expression], index: Int): Expression = { - assert(canRewriteAsLongType(keys)) - // jump over keys that have a higher index value than the required key - if (keys.size == 1) { - assert(index == 0) - Cast( - child = BoundReference(0, LongType, nullable = false), - dataType = keys(index).dataType, - timeZoneId = Option(conf.sessionLocalTimeZone), - ansiEnabled = false) - } else { - val shiftedBits = - keys.slice(index + 1, keys.size).map(_.dataType.defaultSize * 8).sum - val mask = (1L << (keys(index).dataType.defaultSize * 8)) - 1 - // build the schema for unpacking the required key - val castChild = BitwiseAnd( - ShiftRightUnsigned(BoundReference(0, LongType, nullable = false), Literal(shiftedBits)), - Literal(mask)) - Cast( - child = castChild, - dataType = keys(index).dataType, - timeZoneId = Option(conf.sessionLocalTimeZone), - ansiEnabled = false) - } - } -} diff --git a/shims/spark32/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala b/shims/spark32/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala index 3e42244bd8c64..8936e6ca63510 100644 --- a/shims/spark32/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala +++ b/shims/spark32/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala @@ -16,7 +16,7 @@ */ package org.apache.gluten.execution -import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.UnaryExecNode /** * Spark 3.5 has changed the parameter type of the generateTreeString API in TreeNode. In order to @@ -25,7 +25,7 @@ import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} * allow different Spark versions to override their own generateTreeString. 
*/ -trait GenerateTreeStringShim extends UnaryExecNode { +trait WholeStageTransformerGenerateTreeStringShim extends UnaryExecNode { def stageId: Int @@ -62,3 +62,28 @@ trait GenerateTreeStringShim extends UnaryExecNode { } } } + +trait InputAdapterGenerateTreeStringShim extends UnaryExecNode { + + override def generateTreeString( + depth: Int, + lastChildren: Seq[Boolean], + append: String => Unit, + verbose: Boolean, + prefix: String = "", + addSuffix: Boolean = false, + maxFields: Int, + printNodeId: Boolean, + indent: Int = 0): Unit = { + child.generateTreeString( + depth, + lastChildren, + append, + verbose, + prefix = "", + addSuffix = false, + maxFields, + printNodeId, + indent) + } +} diff --git a/shims/spark32/src/main/scala/org/apache/gluten/sql/shims/spark32/Spark32Shims.scala b/shims/spark32/src/main/scala/org/apache/gluten/sql/shims/spark32/Spark32Shims.scala index 29fddc697b07b..b036d6dd9a411 100644 --- a/shims/spark32/src/main/scala/org/apache/gluten/sql/shims/spark32/Spark32Shims.scala +++ b/shims/spark32/src/main/scala/org/apache/gluten/sql/shims/spark32/Spark32Shims.scala @@ -27,26 +27,31 @@ import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.csv.CSVOptions -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BinaryExpression, Expression} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BinaryExpression, Expression, InputFileBlockLength, InputFileBlockStart, InputFileName} import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.physical.{Distribution, HashClusteredDistribution} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.catalyst.util.RebaseDateTime.RebaseSpec import org.apache.spark.sql.connector.catalog.Table import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.execution.{FileSourceScanExec, PartitionedFileUtil, SparkPlan} import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.FileFormatWriter.Empty2Null +import org.apache.spark.sql.execution.datasources.parquet.ParquetFilters import org.apache.spark.sql.execution.datasources.v2.BatchScanExec import org.apache.spark.sql.execution.datasources.v2.text.TextScan import org.apache.spark.sql.execution.datasources.v2.utils.CatalogUtil import org.apache.spark.sql.execution.exchange.BroadcastExchangeLike +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.storage.{BlockId, BlockManagerId} import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.parquet.schema.MessageType import java.util.{HashMap => JHashMap, Map => JMap} @@ -189,8 +194,15 @@ class Spark32Shims extends SparkShims { def getFileStatus(partition: PartitionDirectory): Seq[FileStatus] = partition.files + def isFileSplittable( + relation: HadoopFsRelation, + filePath: Path, + sparkSchema: StructType): Boolean = true + def isRowIndexMetadataColumn(name: String): Boolean = false + def findRowIndexColumnIndexInSchema(sparkSchema: StructType): Int = 
-1 + def splitFiles( sparkSession: SparkSession, file: FileStatus, @@ -217,10 +229,20 @@ class Spark32Shims extends SparkShims { } } + override def getFileSizeAndModificationTime( + file: PartitionedFile): (Option[Long], Option[Long]) = { + (None, None) + } + override def generateMetadataColumns( file: PartitionedFile, - metadataColumnNames: Seq[String]): JMap[String, String] = - new JHashMap[String, String]() + metadataColumnNames: Seq[String]): JMap[String, String] = { + val metadataColumn = new JHashMap[String, String]() + metadataColumn.put(InputFileName().prettyName, file.filePath) + metadataColumn.put(InputFileBlockStart().prettyName, file.start.toString) + metadataColumn.put(InputFileBlockLength().prettyName, file.length.toString) + metadataColumn + } def getAnalysisExceptionPlan(ae: AnalysisException): Option[LogicalPlan] = { ae.plan @@ -238,4 +260,20 @@ class Spark32Shims extends SparkShims { csvOptions.dateFormat == default.dateFormat && csvOptions.timestampFormat == default.timestampFormat } + + override def createParquetFilters( + conf: SQLConf, + schema: MessageType, + caseSensitive: Option[Boolean] = None): ParquetFilters = { + new ParquetFilters( + schema, + conf.parquetFilterPushDownDate, + conf.parquetFilterPushDownTimestamp, + conf.parquetFilterPushDownDecimal, + conf.parquetFilterPushDownStringStartWith, + conf.parquetFilterPushDownInFilterThreshold, + caseSensitive.getOrElse(conf.caseSensitiveAnalysis), + RebaseSpec(LegacyBehaviorPolicy.CORRECTED) + ) + } } diff --git a/shims/spark32/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCostEvaluator.scala b/shims/spark32/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCostEvaluator.scala new file mode 100644 index 0000000000000..a6f066462a1ac --- /dev/null +++ b/shims/spark32/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCostEvaluator.scala @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.adaptive + +import org.apache.gluten.GlutenConfig + +import org.apache.spark.sql.execution.SparkPlan + +/** This [[CostEvaluator]] is to force use the new physical plan when cost is equal. 
*/ +case class GlutenCostEvaluator() extends CostEvaluator { + override def evaluateCost(plan: SparkPlan): Cost = { + if (GlutenConfig.getConf.enableGluten) { + new GlutenCost(SimpleCostEvaluator, plan) + } else { + SimpleCostEvaluator.evaluateCost(plan) + } + } +} diff --git a/shims/spark32/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala b/shims/spark32/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala index 95106b4edba13..46b59ac306c21 100644 --- a/shims/spark32/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala +++ b/shims/spark32/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala @@ -75,7 +75,7 @@ abstract private[hive] class AbstractHiveTableScanExec( override val output: Seq[Attribute] = { // Retrieve the original attributes based on expression ID so that capitalization matches. - requestedAttributes.map(originalAttributes) + requestedAttributes.map(originalAttributes).distinct } // Bind all partition key attribute references in the partition pruning predicate for later diff --git a/shims/spark33/pom.xml b/shims/spark33/pom.xml index 23e35718a4aff..725d1f8edf01f 100644 --- a/shims/spark33/pom.xml +++ b/shims/spark33/pom.xml @@ -109,6 +109,7 @@ -Wconf:cat=deprecation:silent + -P:wartremover:traverser:io.github.zhztheplayer.scalawarts.InheritFromCaseClass diff --git a/shims/spark33/src/main/java/org/apache/spark/sql/execution/joins/HashJoin.scala.deprecated b/shims/spark33/src/main/java/org/apache/spark/sql/execution/joins/HashJoin.scala.deprecated deleted file mode 100644 index 3fc8237306140..0000000000000 --- a/shims/spark33/src/main/java/org/apache/spark/sql/execution/joins/HashJoin.scala.deprecated +++ /dev/null @@ -1,774 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.execution.joins - -import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} -import org.apache.spark.sql.catalyst.analysis.CastSupport -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences -import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide} -import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.physical.Partitioning -import org.apache.spark.sql.execution.{CodegenSupport, ExplainUtils, RowIterator} -import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.types.{BooleanType, IntegralType, LongType} - -/** - * @param relationTerm variable name for HashedRelation - * @param keyIsUnique indicate whether keys of HashedRelation known to be unique in code-gen time - * @param isEmpty indicate whether it known to be EmptyHashedRelation in code-gen time - */ -private[joins] case class HashedRelationInfo( - relationTerm: String, - keyIsUnique: Boolean, - isEmpty: Boolean) - -trait HashJoin extends JoinCodegenSupport { - def buildSide: BuildSide - - override def simpleStringWithNodeId(): String = { - val opId = ExplainUtils.getOpId(this) - s"$nodeName $joinType ${buildSide} ($opId)".trim - } - - override def output: Seq[Attribute] = { - joinType match { - case _: InnerLike => - left.output ++ right.output - case LeftOuter => - left.output ++ right.output.map(_.withNullability(true)) - case RightOuter => - left.output.map(_.withNullability(true)) ++ right.output - case j: ExistenceJoin => - left.output :+ j.exists - case LeftExistence(_) => - left.output - case x => - throw new IllegalArgumentException(s"HashJoin should not take $x as the JoinType") - } - } - - override def outputPartitioning: Partitioning = buildSide match { - case BuildLeft => - joinType match { - case _: InnerLike | RightOuter => right.outputPartitioning - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building left side") - } - case BuildRight => - joinType match { - case _: InnerLike | LeftOuter | LeftSemi | LeftAnti | _: ExistenceJoin => - left.outputPartitioning - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building right side") - } - } - - /** - * Handle the special cases for LeftOuter/LeftSemi with BuildLeft and RightOuter with BuildRight. 
- */ - override def outputOrdering: Seq[SortOrder] = buildSide match { - case BuildLeft => - joinType match { - case _: InnerLike | RightOuter => right.outputOrdering - case LeftOuter => left.outputOrdering - case LeftSemi => left.outputOrdering - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building left side") - } - case BuildRight => - joinType match { - case _: InnerLike | LeftOuter | LeftSemi | LeftAnti | _: ExistenceJoin => - left.outputOrdering - case RightOuter => right.outputOrdering - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building right side") - } - } - - protected lazy val (buildPlan, streamedPlan) = buildSide match { - case BuildLeft => (left, right) - case BuildRight => (right, left) - } - - protected lazy val (buildKeys, streamedKeys) = { - require(leftKeys.length == rightKeys.length && - leftKeys.map(_.dataType) - .zip(rightKeys.map(_.dataType)) - .forall(types => types._1.sameType(types._2)), - "Join keys from two sides should have same length and types") - buildSide match { - case BuildLeft => (leftKeys, rightKeys) - case BuildRight => (rightKeys, leftKeys) - } - } - - @transient protected lazy val (buildOutput, streamedOutput) = { - buildSide match { - case BuildLeft => (left.output, right.output) - case BuildRight => (right.output, left.output) - } - } - - @transient protected lazy val buildBoundKeys = - bindReferences(HashJoin.rewriteKeyExpr(buildKeys), buildOutput) - - @transient protected lazy val streamedBoundKeys = - bindReferences(HashJoin.rewriteKeyExpr(streamedKeys), streamedOutput) - - protected def buildSideKeyGenerator(): Projection = - UnsafeProjection.create(buildBoundKeys) - - protected def streamSideKeyGenerator(): UnsafeProjection = - UnsafeProjection.create(streamedBoundKeys) - - @transient protected[this] lazy val boundCondition = if (condition.isDefined) { - if (joinType == FullOuter && buildSide == BuildLeft) { - // Put join left side before right side. This is to be consistent with - // `ShuffledHashJoinExec.fullOuterJoin`. 
- Predicate.create(condition.get, buildPlan.output ++ streamedPlan.output).eval _ - } else { - Predicate.create(condition.get, streamedPlan.output ++ buildPlan.output).eval _ - } - } else { - (r: InternalRow) => true - } - - protected def createResultProjection(): (InternalRow) => InternalRow = joinType match { - case LeftExistence(_) => - UnsafeProjection.create(output, output) - case _ => - // Always put the stream side on left to simplify implementation - // both of left and right side could be null - UnsafeProjection.create( - output, (streamedPlan.output ++ buildPlan.output).map(_.withNullability(true))) - } - - private def innerJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinRow = new JoinedRow - val joinKeys = streamSideKeyGenerator() - - if (hashedRelation == EmptyHashedRelation) { - Iterator.empty - } else if (hashedRelation.keyIsUnique) { - streamIter.flatMap { srow => - joinRow.withLeft(srow) - val matched = hashedRelation.getValue(joinKeys(srow)) - if (matched != null) { - Some(joinRow.withRight(matched)).filter(boundCondition) - } else { - None - } - } - } else { - streamIter.flatMap { srow => - joinRow.withLeft(srow) - val matches = hashedRelation.get(joinKeys(srow)) - if (matches != null) { - matches.map(joinRow.withRight).filter(boundCondition) - } else { - Seq.empty - } - } - } - } - - private def outerJoin( - streamedIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinedRow = new JoinedRow() - val keyGenerator = streamSideKeyGenerator() - val nullRow = new GenericInternalRow(buildPlan.output.length) - - if (hashedRelation.keyIsUnique) { - streamedIter.map { currentRow => - val rowKey = keyGenerator(currentRow) - joinedRow.withLeft(currentRow) - val matched = hashedRelation.getValue(rowKey) - if (matched != null && boundCondition(joinedRow.withRight(matched))) { - joinedRow - } else { - joinedRow.withRight(nullRow) - } - } - } else { - streamedIter.flatMap { currentRow => - val rowKey = keyGenerator(currentRow) - joinedRow.withLeft(currentRow) - val buildIter = hashedRelation.get(rowKey) - new RowIterator { - private var found = false - override def advanceNext(): Boolean = { - while (buildIter != null && buildIter.hasNext) { - val nextBuildRow = buildIter.next() - if (boundCondition(joinedRow.withRight(nextBuildRow))) { - found = true - return true - } - } - if (!found) { - joinedRow.withRight(nullRow) - found = true - return true - } - false - } - override def getRow: InternalRow = joinedRow - }.toScala - } - } - } - - private def semiJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinKeys = streamSideKeyGenerator() - val joinedRow = new JoinedRow - - if (hashedRelation == EmptyHashedRelation) { - Iterator.empty - } else if (hashedRelation.keyIsUnique) { - streamIter.filter { current => - val key = joinKeys(current) - lazy val matched = hashedRelation.getValue(key) - !key.anyNull && matched != null && - (condition.isEmpty || boundCondition(joinedRow(current, matched))) - } - } else { - streamIter.filter { current => - val key = joinKeys(current) - lazy val buildIter = hashedRelation.get(key) - !key.anyNull && buildIter != null && (condition.isEmpty || buildIter.exists { - (row: InternalRow) => boundCondition(joinedRow(current, row)) - }) - } - } - } - - private def existenceJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinKeys = 
streamSideKeyGenerator() - val result = new GenericInternalRow(Array[Any](null)) - val joinedRow = new JoinedRow - - if (hashedRelation.keyIsUnique) { - streamIter.map { current => - val key = joinKeys(current) - lazy val matched = hashedRelation.getValue(key) - val exists = !key.anyNull && matched != null && - (condition.isEmpty || boundCondition(joinedRow(current, matched))) - result.setBoolean(0, exists) - joinedRow(current, result) - } - } else { - streamIter.map { current => - val key = joinKeys(current) - lazy val buildIter = hashedRelation.get(key) - val exists = !key.anyNull && buildIter != null && (condition.isEmpty || buildIter.exists { - (row: InternalRow) => boundCondition(joinedRow(current, row)) - }) - result.setBoolean(0, exists) - joinedRow(current, result) - } - } - } - - private def antiJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - // If the right side is empty, AntiJoin simply returns the left side. - if (hashedRelation == EmptyHashedRelation) { - return streamIter - } - - val joinKeys = streamSideKeyGenerator() - val joinedRow = new JoinedRow - - if (hashedRelation.keyIsUnique) { - streamIter.filter { current => - val key = joinKeys(current) - lazy val matched = hashedRelation.getValue(key) - key.anyNull || matched == null || - (condition.isDefined && !boundCondition(joinedRow(current, matched))) - } - } else { - streamIter.filter { current => - val key = joinKeys(current) - lazy val buildIter = hashedRelation.get(key) - key.anyNull || buildIter == null || (condition.isDefined && !buildIter.exists { - row => boundCondition(joinedRow(current, row)) - }) - } - } - } - - protected def join( - streamedIter: Iterator[InternalRow], - hashed: HashedRelation, - numOutputRows: SQLMetric): Iterator[InternalRow] = { - - val joinedIter = joinType match { - case _: InnerLike => - innerJoin(streamedIter, hashed) - case LeftOuter | RightOuter => - outerJoin(streamedIter, hashed) - case LeftSemi => - semiJoin(streamedIter, hashed) - case LeftAnti => - antiJoin(streamedIter, hashed) - case _: ExistenceJoin => - existenceJoin(streamedIter, hashed) - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType") - } - - val resultProj = createResultProjection - joinedIter.map { r => - numOutputRows += 1 - resultProj(r) - } - } - - override def doProduce(ctx: CodegenContext): String = { - streamedPlan.asInstanceOf[CodegenSupport].produce(ctx, this) - } - - override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { - joinType match { - case _: InnerLike => codegenInner(ctx, input) - case LeftOuter | RightOuter => codegenOuter(ctx, input) - case LeftSemi => codegenSemi(ctx, input) - case LeftAnti => codegenAnti(ctx, input) - case _: ExistenceJoin => codegenExistence(ctx, input) - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType") - } - } - - /** - * Returns the code for generating join key for stream side, and expression of whether the key - * has any null in it or not. 
- */ - protected def genStreamSideJoinKey( - ctx: CodegenContext, - input: Seq[ExprCode]): (ExprCode, String) = { - ctx.currentVars = input - if (streamedBoundKeys.length == 1 && streamedBoundKeys.head.dataType == LongType) { - // generate the join key as Long - val ev = streamedBoundKeys.head.genCode(ctx) - (ev, ev.isNull) - } else { - // generate the join key as UnsafeRow - val ev = GenerateUnsafeProjection.createCode(ctx, streamedBoundKeys) - (ev, s"${ev.value}.anyNull()") - } - } - - /** - * Generates the code for Inner join. - */ - protected def codegenInner(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, isEmptyHashedRelation) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val (matched, checkCondition, buildVars) = getJoinCondition(ctx, input, streamedPlan, buildPlan) - val numOutput = metricTerm(ctx, "numOutputRows") - - val resultVars = buildSide match { - case BuildLeft => buildVars ++ input - case BuildRight => input ++ buildVars - } - - if (isEmptyHashedRelation) { - """ - |// If HashedRelation is empty, hash inner join simply returns nothing. - """.stripMargin - } else if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |if ($matched != null) { - | $checkCondition { - | $numOutput.add(1); - | ${consume(ctx, resultVars)} - | } - |} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? - | null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |if ($matches != null) { - | while ($matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition { - | $numOutput.add(1); - | ${consume(ctx, resultVars)} - | } - | } - |} - """.stripMargin - } - } - - /** - * Generates the code for left or right outer join. - */ - protected def codegenOuter(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, _) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val matched = ctx.freshName("matched") - val buildVars = genOneSideJoinVars(ctx, matched, buildPlan, setDefaultValue = true) - val numOutput = metricTerm(ctx, "numOutputRows") - - // filter the output via condition - val conditionPassed = ctx.freshName("conditionPassed") - val checkCondition = if (condition.isDefined) { - val expr = condition.get - // evaluate the variables from build side that used by condition - val eval = evaluateRequiredVariables(buildPlan.output, buildVars, expr.references) - ctx.currentVars = input ++ buildVars - val ev = - BindReferences.bindReference(expr, streamedPlan.output ++ buildPlan.output).genCode(ctx) - s""" - |boolean $conditionPassed = true; - |${eval.trim} - |if ($matched != null) { - | ${ev.code} - | $conditionPassed = !${ev.isNull} && ${ev.value}; - |} - """.stripMargin - } else { - s"final boolean $conditionPassed = true;" - } - - val resultVars = buildSide match { - case BuildLeft => buildVars ++ input - case BuildRight => input ++ buildVars - } - - if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? 
null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |${checkCondition.trim} - |if (!$conditionPassed) { - | $matched = null; - | // reset the variables those are already evaluated. - | ${buildVars.filter(_.code.isEmpty).map(v => s"${v.isNull} = true;").mkString("\n")} - |} - |$numOutput.add(1); - |${consume(ctx, resultVars)} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - val found = ctx.freshName("found") - - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |boolean $found = false; - |// the last iteration of this loop is to emit an empty row if there is no matched rows. - |while ($matches != null && $matches.hasNext() || !$found) { - | UnsafeRow $matched = $matches != null && $matches.hasNext() ? - | (UnsafeRow) $matches.next() : null; - | ${checkCondition.trim} - | if ($conditionPassed) { - | $found = true; - | $numOutput.add(1); - | ${consume(ctx, resultVars)} - | } - |} - """.stripMargin - } - } - - /** - * Generates the code for left semi join. - */ - protected def codegenSemi(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, isEmptyHashedRelation) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val (matched, checkCondition, _) = getJoinCondition(ctx, input, streamedPlan, buildPlan) - val numOutput = metricTerm(ctx, "numOutputRows") - - if (isEmptyHashedRelation) { - """ - |// If HashedRelation is empty, hash semi join simply returns nothing. - """.stripMargin - } else if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |if ($matched != null) { - | $checkCondition { - | $numOutput.add(1); - | ${consume(ctx, input)} - | } - |} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - val found = ctx.freshName("found") - - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |if ($matches != null) { - | boolean $found = false; - | while (!$found && $matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition { - | $found = true; - | } - | } - | if ($found) { - | $numOutput.add(1); - | ${consume(ctx, input)} - | } - |} - """.stripMargin - } - } - - /** - * Generates the code for anti join. - */ - protected def codegenAnti(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, isEmptyHashedRelation) = prepareRelation(ctx) - val numOutput = metricTerm(ctx, "numOutputRows") - if (isEmptyHashedRelation) { - return s""" - |// If HashedRelation is empty, hash anti join simply returns the stream side. - |$numOutput.add(1); - |${consume(ctx, input)} - """.stripMargin - } - - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val (matched, checkCondition, _) = getJoinCondition(ctx, input, streamedPlan, buildPlan) - - if (keyIsUnique) { - val found = ctx.freshName("found") - s""" - |boolean $found = false; - |// generate join key for stream side - |${keyEv.code} - |// Check if the key has nulls. 
- |if (!($anyNull)) { - | // Check if the HashedRelation exists. - | UnsafeRow $matched = (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - | if ($matched != null) { - | // Evaluate the condition. - | $checkCondition { - | $found = true; - | } - | } - |} - |if (!$found) { - | $numOutput.add(1); - | ${consume(ctx, input)} - |} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - val found = ctx.freshName("found") - s""" - |boolean $found = false; - |// generate join key for stream side - |${keyEv.code} - |// Check if the key has nulls. - |if (!($anyNull)) { - | // Check if the HashedRelation exists. - | $iteratorCls $matches = ($iteratorCls)$relationTerm.get(${keyEv.value}); - | if ($matches != null) { - | // Evaluate the condition. - | while (!$found && $matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition { - | $found = true; - | } - | } - | } - |} - |if (!$found) { - | $numOutput.add(1); - | ${consume(ctx, input)} - |} - """.stripMargin - } - } - - /** - * Generates the code for existence join. - */ - protected def codegenExistence(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, _) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val numOutput = metricTerm(ctx, "numOutputRows") - val existsVar = ctx.freshName("exists") - - val matched = ctx.freshName("matched") - val buildVars = genOneSideJoinVars(ctx, matched, buildPlan, setDefaultValue = false) - val checkCondition = if (condition.isDefined) { - val expr = condition.get - // evaluate the variables from build side that used by condition - val eval = evaluateRequiredVariables(buildPlan.output, buildVars, expr.references) - // filter the output via condition - ctx.currentVars = input ++ buildVars - val ev = - BindReferences.bindReference(expr, streamedPlan.output ++ buildPlan.output).genCode(ctx) - s""" - |$eval - |${ev.code} - |$existsVar = !${ev.isNull} && ${ev.value}; - """.stripMargin - } else { - s"$existsVar = true;" - } - - val resultVar = input ++ Seq(ExprCode.forNonNullValue( - JavaCode.variable(existsVar, BooleanType))) - - if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |boolean $existsVar = false; - |if ($matched != null) { - | $checkCondition - |} - |$numOutput.add(1); - |${consume(ctx, resultVar)} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? 
null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |boolean $existsVar = false; - |if ($matches != null) { - | while (!$existsVar && $matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition - | } - |} - |$numOutput.add(1); - |${consume(ctx, resultVar)} - """.stripMargin - } - } - - protected def prepareRelation(ctx: CodegenContext): HashedRelationInfo -} - -object HashJoin extends CastSupport with SQLConfHelper { - - private def canRewriteAsLongType(keys: Seq[Expression]): Boolean = { - // TODO: support BooleanType, DateType and TimestampType - keys.forall(_.dataType.isInstanceOf[IntegralType]) && - keys.map(_.dataType.defaultSize).sum <= 8 - } - - /** - * Try to rewrite the key as LongType so we can use getLong(), if they key can fit with a long. - * - * If not, returns the original expressions. - */ - def rewriteKeyExpr(keys: Seq[Expression]): Seq[Expression] = { - assert(keys.nonEmpty) - if (!canRewriteAsLongType(keys)) { - return keys - } - - var keyExpr: Expression = if (keys.head.dataType != LongType) { - cast(keys.head, LongType) - } else { - keys.head - } - keys.tail.foreach { e => - val bits = e.dataType.defaultSize * 8 - keyExpr = BitwiseOr(ShiftLeft(keyExpr, Literal(bits)), - BitwiseAnd(cast(e, LongType), Literal((1L << bits) - 1))) - } - keyExpr :: Nil - } - - /** - * Extract a given key which was previously packed in a long value using its index to - * determine the number of bits to shift - */ - def extractKeyExprAt(keys: Seq[Expression], index: Int): Expression = { - assert(canRewriteAsLongType(keys)) - // jump over keys that have a higher index value than the required key - if (keys.size == 1) { - assert(index == 0) - Cast( - child = BoundReference(0, LongType, nullable = false), - dataType = keys(index).dataType, - timeZoneId = Option(conf.sessionLocalTimeZone), - ansiEnabled = false) - } else { - val shiftedBits = - keys.slice(index + 1, keys.size).map(_.dataType.defaultSize * 8).sum - val mask = (1L << (keys(index).dataType.defaultSize * 8)) - 1 - // build the schema for unpacking the required key - val castChild = BitwiseAnd( - ShiftRightUnsigned(BoundReference(0, LongType, nullable = false), Literal(shiftedBits)), - Literal(mask)) - Cast( - child = castChild, - dataType = keys(index).dataType, - timeZoneId = Option(conf.sessionLocalTimeZone), - ansiEnabled = false) - } - } -} diff --git a/shims/spark33/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala b/shims/spark33/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala index 3e42244bd8c64..8936e6ca63510 100644 --- a/shims/spark33/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala +++ b/shims/spark33/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala @@ -16,7 +16,7 @@ */ package org.apache.gluten.execution -import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.UnaryExecNode /** * Spark 3.5 has changed the parameter type of the generateTreeString API in TreeNode. In order to @@ -25,7 +25,7 @@ import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} * allow different Spark versions to override their own generateTreeString. 
*/ -trait GenerateTreeStringShim extends UnaryExecNode { +trait WholeStageTransformerGenerateTreeStringShim extends UnaryExecNode { def stageId: Int @@ -62,3 +62,28 @@ trait GenerateTreeStringShim extends UnaryExecNode { } } } + +trait InputAdapterGenerateTreeStringShim extends UnaryExecNode { + + override def generateTreeString( + depth: Int, + lastChildren: Seq[Boolean], + append: String => Unit, + verbose: Boolean, + prefix: String = "", + addSuffix: Boolean = false, + maxFields: Int, + printNodeId: Boolean, + indent: Int = 0): Unit = { + child.generateTreeString( + depth, + lastChildren, + append, + verbose, + prefix = "", + addSuffix = false, + maxFields, + printNodeId, + indent) + } +} diff --git a/shims/spark33/src/main/scala/org/apache/gluten/sql/shims/spark33/Spark33Shims.scala b/shims/spark33/src/main/scala/org/apache/gluten/sql/shims/spark33/Spark33Shims.scala index 7c6ce644dc74a..8b12c2642c55b 100644 --- a/shims/spark33/src/main/scala/org/apache/gluten/sql/shims/spark33/Spark33Shims.scala +++ b/shims/spark33/src/main/scala/org/apache/gluten/sql/shims/spark33/Spark33Shims.scala @@ -34,21 +34,26 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Distribution} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.catalyst.util.RebaseDateTime.RebaseSpec import org.apache.spark.sql.catalyst.util.TimestampFormatter import org.apache.spark.sql.connector.catalog.Table import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.execution.{FileSourceScanExec, PartitionedFileUtil, SparkPlan} import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.FileFormatWriter.Empty2Null +import org.apache.spark.sql.execution.datasources.parquet.ParquetFilters import org.apache.spark.sql.execution.datasources.v2.BatchScanExec import org.apache.spark.sql.execution.datasources.v2.text.TextScan import org.apache.spark.sql.execution.datasources.v2.utils.CatalogUtil import org.apache.spark.sql.execution.exchange.BroadcastExchangeLike +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.storage.{BlockId, BlockManagerId} import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.parquet.schema.MessageType import java.time.ZoneOffset import java.util.{HashMap => JHashMap, Map => JMap} @@ -202,6 +207,11 @@ class Spark33Shims extends SparkShims { case other => other } + override def getFileSizeAndModificationTime( + file: PartitionedFile): (Option[Long], Option[Long]) = { + (Some(file.fileSize), Some(file.modificationTime)) + } + override def generateMetadataColumns( file: PartitionedFile, metadataColumnNames: Seq[String]): JMap[String, String] = { @@ -221,6 +231,9 @@ class Spark33Shims extends SparkShims { case _ => } } + metadataColumn.put(InputFileName().prettyName, file.filePath) + metadataColumn.put(InputFileBlockStart().prettyName, file.start.toString) + metadataColumn.put(InputFileBlockLength().prettyName, file.length.toString) metadataColumn } @@ -273,8 +286,15 @@ class Spark33Shims extends SparkShims { def getFileStatus(partition: PartitionDirectory): Seq[FileStatus] = partition.files + def isFileSplittable( + relation: HadoopFsRelation, + filePath: 
Path, + sparkSchema: StructType): Boolean = true + def isRowIndexMetadataColumn(name: String): Boolean = false + def findRowIndexColumnIndexInSchema(sparkSchema: StructType): Int = -1 + def splitFiles( sparkSession: SparkSession, file: FileStatus, @@ -329,4 +349,20 @@ class Spark33Shims extends SparkShims { csvOptions.timestampFormatInRead == default.timestampFormatInRead && csvOptions.timestampNTZFormatInRead == default.timestampNTZFormatInRead } + + override def createParquetFilters( + conf: SQLConf, + schema: MessageType, + caseSensitive: Option[Boolean] = None): ParquetFilters = { + new ParquetFilters( + schema, + conf.parquetFilterPushDownDate, + conf.parquetFilterPushDownTimestamp, + conf.parquetFilterPushDownDecimal, + conf.parquetFilterPushDownStringStartWith, + conf.parquetFilterPushDownInFilterThreshold, + caseSensitive.getOrElse(conf.caseSensitiveAnalysis), + RebaseSpec(LegacyBehaviorPolicy.CORRECTED) + ) + } } diff --git a/shims/spark33/src/main/scala/org/apache/spark/sql/execution/JoinSelectionShim.scala b/shims/spark33/src/main/scala/org/apache/spark/sql/execution/JoinSelectionShim.scala deleted file mode 100644 index 20b9dea333a57..0000000000000 --- a/shims/spark33/src/main/scala/org/apache/spark/sql/execution/JoinSelectionShim.scala +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.sql.execution - -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys -import org.apache.spark.sql.catalyst.plans.JoinType -import org.apache.spark.sql.catalyst.plans.logical.{Join, JoinHint, LogicalPlan} - -// https://issues.apache.org/jira/browse/SPARK-36745 -object JoinSelectionShim { - object ExtractEquiJoinKeysShim { - type ReturnType = - ( - JoinType, - Seq[Expression], - Seq[Expression], - Option[Expression], - LogicalPlan, - LogicalPlan, - JoinHint) - def unapply(join: Join): Option[ReturnType] = { - ExtractEquiJoinKeys.unapply(join).map { - case ( - joinType, - leftKeys, - rightKeys, - otherPredicates, - predicatesOfJoinKeys, - left, - right, - hint) => - (joinType, leftKeys, rightKeys, otherPredicates, left, right, hint) - } - } - } -} diff --git a/shims/spark32/src/main/scala/org/apache/spark/sql/execution/JoinSelectionShim.scala b/shims/spark33/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCostEvaluator.scala similarity index 52% rename from shims/spark32/src/main/scala/org/apache/spark/sql/execution/JoinSelectionShim.scala rename to shims/spark33/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCostEvaluator.scala index 5849dd600b7da..8fcfa735f463d 100644 --- a/shims/spark32/src/main/scala/org/apache/spark/sql/execution/JoinSelectionShim.scala +++ b/shims/spark33/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCostEvaluator.scala @@ -14,27 +14,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution +package org.apache.spark.sql.execution.adaptive -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys -import org.apache.spark.sql.catalyst.plans.JoinType -import org.apache.spark.sql.catalyst.plans.logical.{Join, JoinHint, LogicalPlan} +import org.apache.gluten.GlutenConfig -// https://issues.apache.org/jira/browse/SPARK-36745 -object JoinSelectionShim { - object ExtractEquiJoinKeysShim { - type ReturnType = - ( - JoinType, - Seq[Expression], - Seq[Expression], - Option[Expression], - LogicalPlan, - LogicalPlan, - JoinHint) - def unapply(join: Join): Option[ReturnType] = { - ExtractEquiJoinKeys.unapply(join) +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.internal.SQLConf + +/** This [[CostEvaluator]] is to force use the new physical plan when cost is equal. 
*/ +case class GlutenCostEvaluator() extends CostEvaluator with SQLConfHelper { + override def evaluateCost(plan: SparkPlan): Cost = { + val forceOptimizeSkewedJoin = conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN) + if (GlutenConfig.getConf.enableGluten) { + new GlutenCost(SimpleCostEvaluator(forceOptimizeSkewedJoin), plan) + } else { + SimpleCostEvaluator(forceOptimizeSkewedJoin).evaluateCost(plan) } } } diff --git a/shims/spark33/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala b/shims/spark33/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala index 78f5ff7f1be1c..dd095f0ff2472 100644 --- a/shims/spark33/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala +++ b/shims/spark33/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala @@ -75,7 +75,7 @@ abstract private[hive] class AbstractHiveTableScanExec( override val output: Seq[Attribute] = { // Retrieve the original attributes based on expression ID so that capitalization matches. - requestedAttributes.map(originalAttributes) + requestedAttributes.map(originalAttributes).distinct } // Bind all partition key attribute references in the partition pruning predicate for later diff --git a/shims/spark34/pom.xml b/shims/spark34/pom.xml index 42d4b830dfd0f..9cc9bb409beae 100644 --- a/shims/spark34/pom.xml +++ b/shims/spark34/pom.xml @@ -109,6 +109,7 @@ -Wconf:cat=deprecation:silent + -P:wartremover:traverser:io.github.zhztheplayer.scalawarts.InheritFromCaseClass diff --git a/shims/spark34/src/main/java/org/apache/spark/sql/execution/joins/HashJoin.scala.deprecated b/shims/spark34/src/main/java/org/apache/spark/sql/execution/joins/HashJoin.scala.deprecated deleted file mode 100644 index 3fc8237306140..0000000000000 --- a/shims/spark34/src/main/java/org/apache/spark/sql/execution/joins/HashJoin.scala.deprecated +++ /dev/null @@ -1,774 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.execution.joins - -import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} -import org.apache.spark.sql.catalyst.analysis.CastSupport -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences -import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide} -import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.physical.Partitioning -import org.apache.spark.sql.execution.{CodegenSupport, ExplainUtils, RowIterator} -import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.types.{BooleanType, IntegralType, LongType} - -/** - * @param relationTerm variable name for HashedRelation - * @param keyIsUnique indicate whether keys of HashedRelation known to be unique in code-gen time - * @param isEmpty indicate whether it known to be EmptyHashedRelation in code-gen time - */ -private[joins] case class HashedRelationInfo( - relationTerm: String, - keyIsUnique: Boolean, - isEmpty: Boolean) - -trait HashJoin extends JoinCodegenSupport { - def buildSide: BuildSide - - override def simpleStringWithNodeId(): String = { - val opId = ExplainUtils.getOpId(this) - s"$nodeName $joinType ${buildSide} ($opId)".trim - } - - override def output: Seq[Attribute] = { - joinType match { - case _: InnerLike => - left.output ++ right.output - case LeftOuter => - left.output ++ right.output.map(_.withNullability(true)) - case RightOuter => - left.output.map(_.withNullability(true)) ++ right.output - case j: ExistenceJoin => - left.output :+ j.exists - case LeftExistence(_) => - left.output - case x => - throw new IllegalArgumentException(s"HashJoin should not take $x as the JoinType") - } - } - - override def outputPartitioning: Partitioning = buildSide match { - case BuildLeft => - joinType match { - case _: InnerLike | RightOuter => right.outputPartitioning - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building left side") - } - case BuildRight => - joinType match { - case _: InnerLike | LeftOuter | LeftSemi | LeftAnti | _: ExistenceJoin => - left.outputPartitioning - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building right side") - } - } - - /** - * Handle the special cases for LeftOuter/LeftSemi with BuildLeft and RightOuter with BuildRight. 
- */ - override def outputOrdering: Seq[SortOrder] = buildSide match { - case BuildLeft => - joinType match { - case _: InnerLike | RightOuter => right.outputOrdering - case LeftOuter => left.outputOrdering - case LeftSemi => left.outputOrdering - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building left side") - } - case BuildRight => - joinType match { - case _: InnerLike | LeftOuter | LeftSemi | LeftAnti | _: ExistenceJoin => - left.outputOrdering - case RightOuter => right.outputOrdering - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building right side") - } - } - - protected lazy val (buildPlan, streamedPlan) = buildSide match { - case BuildLeft => (left, right) - case BuildRight => (right, left) - } - - protected lazy val (buildKeys, streamedKeys) = { - require(leftKeys.length == rightKeys.length && - leftKeys.map(_.dataType) - .zip(rightKeys.map(_.dataType)) - .forall(types => types._1.sameType(types._2)), - "Join keys from two sides should have same length and types") - buildSide match { - case BuildLeft => (leftKeys, rightKeys) - case BuildRight => (rightKeys, leftKeys) - } - } - - @transient protected lazy val (buildOutput, streamedOutput) = { - buildSide match { - case BuildLeft => (left.output, right.output) - case BuildRight => (right.output, left.output) - } - } - - @transient protected lazy val buildBoundKeys = - bindReferences(HashJoin.rewriteKeyExpr(buildKeys), buildOutput) - - @transient protected lazy val streamedBoundKeys = - bindReferences(HashJoin.rewriteKeyExpr(streamedKeys), streamedOutput) - - protected def buildSideKeyGenerator(): Projection = - UnsafeProjection.create(buildBoundKeys) - - protected def streamSideKeyGenerator(): UnsafeProjection = - UnsafeProjection.create(streamedBoundKeys) - - @transient protected[this] lazy val boundCondition = if (condition.isDefined) { - if (joinType == FullOuter && buildSide == BuildLeft) { - // Put join left side before right side. This is to be consistent with - // `ShuffledHashJoinExec.fullOuterJoin`. 
- Predicate.create(condition.get, buildPlan.output ++ streamedPlan.output).eval _ - } else { - Predicate.create(condition.get, streamedPlan.output ++ buildPlan.output).eval _ - } - } else { - (r: InternalRow) => true - } - - protected def createResultProjection(): (InternalRow) => InternalRow = joinType match { - case LeftExistence(_) => - UnsafeProjection.create(output, output) - case _ => - // Always put the stream side on left to simplify implementation - // both of left and right side could be null - UnsafeProjection.create( - output, (streamedPlan.output ++ buildPlan.output).map(_.withNullability(true))) - } - - private def innerJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinRow = new JoinedRow - val joinKeys = streamSideKeyGenerator() - - if (hashedRelation == EmptyHashedRelation) { - Iterator.empty - } else if (hashedRelation.keyIsUnique) { - streamIter.flatMap { srow => - joinRow.withLeft(srow) - val matched = hashedRelation.getValue(joinKeys(srow)) - if (matched != null) { - Some(joinRow.withRight(matched)).filter(boundCondition) - } else { - None - } - } - } else { - streamIter.flatMap { srow => - joinRow.withLeft(srow) - val matches = hashedRelation.get(joinKeys(srow)) - if (matches != null) { - matches.map(joinRow.withRight).filter(boundCondition) - } else { - Seq.empty - } - } - } - } - - private def outerJoin( - streamedIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinedRow = new JoinedRow() - val keyGenerator = streamSideKeyGenerator() - val nullRow = new GenericInternalRow(buildPlan.output.length) - - if (hashedRelation.keyIsUnique) { - streamedIter.map { currentRow => - val rowKey = keyGenerator(currentRow) - joinedRow.withLeft(currentRow) - val matched = hashedRelation.getValue(rowKey) - if (matched != null && boundCondition(joinedRow.withRight(matched))) { - joinedRow - } else { - joinedRow.withRight(nullRow) - } - } - } else { - streamedIter.flatMap { currentRow => - val rowKey = keyGenerator(currentRow) - joinedRow.withLeft(currentRow) - val buildIter = hashedRelation.get(rowKey) - new RowIterator { - private var found = false - override def advanceNext(): Boolean = { - while (buildIter != null && buildIter.hasNext) { - val nextBuildRow = buildIter.next() - if (boundCondition(joinedRow.withRight(nextBuildRow))) { - found = true - return true - } - } - if (!found) { - joinedRow.withRight(nullRow) - found = true - return true - } - false - } - override def getRow: InternalRow = joinedRow - }.toScala - } - } - } - - private def semiJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinKeys = streamSideKeyGenerator() - val joinedRow = new JoinedRow - - if (hashedRelation == EmptyHashedRelation) { - Iterator.empty - } else if (hashedRelation.keyIsUnique) { - streamIter.filter { current => - val key = joinKeys(current) - lazy val matched = hashedRelation.getValue(key) - !key.anyNull && matched != null && - (condition.isEmpty || boundCondition(joinedRow(current, matched))) - } - } else { - streamIter.filter { current => - val key = joinKeys(current) - lazy val buildIter = hashedRelation.get(key) - !key.anyNull && buildIter != null && (condition.isEmpty || buildIter.exists { - (row: InternalRow) => boundCondition(joinedRow(current, row)) - }) - } - } - } - - private def existenceJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinKeys = 
streamSideKeyGenerator() - val result = new GenericInternalRow(Array[Any](null)) - val joinedRow = new JoinedRow - - if (hashedRelation.keyIsUnique) { - streamIter.map { current => - val key = joinKeys(current) - lazy val matched = hashedRelation.getValue(key) - val exists = !key.anyNull && matched != null && - (condition.isEmpty || boundCondition(joinedRow(current, matched))) - result.setBoolean(0, exists) - joinedRow(current, result) - } - } else { - streamIter.map { current => - val key = joinKeys(current) - lazy val buildIter = hashedRelation.get(key) - val exists = !key.anyNull && buildIter != null && (condition.isEmpty || buildIter.exists { - (row: InternalRow) => boundCondition(joinedRow(current, row)) - }) - result.setBoolean(0, exists) - joinedRow(current, result) - } - } - } - - private def antiJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - // If the right side is empty, AntiJoin simply returns the left side. - if (hashedRelation == EmptyHashedRelation) { - return streamIter - } - - val joinKeys = streamSideKeyGenerator() - val joinedRow = new JoinedRow - - if (hashedRelation.keyIsUnique) { - streamIter.filter { current => - val key = joinKeys(current) - lazy val matched = hashedRelation.getValue(key) - key.anyNull || matched == null || - (condition.isDefined && !boundCondition(joinedRow(current, matched))) - } - } else { - streamIter.filter { current => - val key = joinKeys(current) - lazy val buildIter = hashedRelation.get(key) - key.anyNull || buildIter == null || (condition.isDefined && !buildIter.exists { - row => boundCondition(joinedRow(current, row)) - }) - } - } - } - - protected def join( - streamedIter: Iterator[InternalRow], - hashed: HashedRelation, - numOutputRows: SQLMetric): Iterator[InternalRow] = { - - val joinedIter = joinType match { - case _: InnerLike => - innerJoin(streamedIter, hashed) - case LeftOuter | RightOuter => - outerJoin(streamedIter, hashed) - case LeftSemi => - semiJoin(streamedIter, hashed) - case LeftAnti => - antiJoin(streamedIter, hashed) - case _: ExistenceJoin => - existenceJoin(streamedIter, hashed) - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType") - } - - val resultProj = createResultProjection - joinedIter.map { r => - numOutputRows += 1 - resultProj(r) - } - } - - override def doProduce(ctx: CodegenContext): String = { - streamedPlan.asInstanceOf[CodegenSupport].produce(ctx, this) - } - - override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { - joinType match { - case _: InnerLike => codegenInner(ctx, input) - case LeftOuter | RightOuter => codegenOuter(ctx, input) - case LeftSemi => codegenSemi(ctx, input) - case LeftAnti => codegenAnti(ctx, input) - case _: ExistenceJoin => codegenExistence(ctx, input) - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType") - } - } - - /** - * Returns the code for generating join key for stream side, and expression of whether the key - * has any null in it or not. 
- */ - protected def genStreamSideJoinKey( - ctx: CodegenContext, - input: Seq[ExprCode]): (ExprCode, String) = { - ctx.currentVars = input - if (streamedBoundKeys.length == 1 && streamedBoundKeys.head.dataType == LongType) { - // generate the join key as Long - val ev = streamedBoundKeys.head.genCode(ctx) - (ev, ev.isNull) - } else { - // generate the join key as UnsafeRow - val ev = GenerateUnsafeProjection.createCode(ctx, streamedBoundKeys) - (ev, s"${ev.value}.anyNull()") - } - } - - /** - * Generates the code for Inner join. - */ - protected def codegenInner(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, isEmptyHashedRelation) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val (matched, checkCondition, buildVars) = getJoinCondition(ctx, input, streamedPlan, buildPlan) - val numOutput = metricTerm(ctx, "numOutputRows") - - val resultVars = buildSide match { - case BuildLeft => buildVars ++ input - case BuildRight => input ++ buildVars - } - - if (isEmptyHashedRelation) { - """ - |// If HashedRelation is empty, hash inner join simply returns nothing. - """.stripMargin - } else if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |if ($matched != null) { - | $checkCondition { - | $numOutput.add(1); - | ${consume(ctx, resultVars)} - | } - |} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? - | null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |if ($matches != null) { - | while ($matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition { - | $numOutput.add(1); - | ${consume(ctx, resultVars)} - | } - | } - |} - """.stripMargin - } - } - - /** - * Generates the code for left or right outer join. - */ - protected def codegenOuter(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, _) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val matched = ctx.freshName("matched") - val buildVars = genOneSideJoinVars(ctx, matched, buildPlan, setDefaultValue = true) - val numOutput = metricTerm(ctx, "numOutputRows") - - // filter the output via condition - val conditionPassed = ctx.freshName("conditionPassed") - val checkCondition = if (condition.isDefined) { - val expr = condition.get - // evaluate the variables from build side that used by condition - val eval = evaluateRequiredVariables(buildPlan.output, buildVars, expr.references) - ctx.currentVars = input ++ buildVars - val ev = - BindReferences.bindReference(expr, streamedPlan.output ++ buildPlan.output).genCode(ctx) - s""" - |boolean $conditionPassed = true; - |${eval.trim} - |if ($matched != null) { - | ${ev.code} - | $conditionPassed = !${ev.isNull} && ${ev.value}; - |} - """.stripMargin - } else { - s"final boolean $conditionPassed = true;" - } - - val resultVars = buildSide match { - case BuildLeft => buildVars ++ input - case BuildRight => input ++ buildVars - } - - if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? 
null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |${checkCondition.trim} - |if (!$conditionPassed) { - | $matched = null; - | // reset the variables those are already evaluated. - | ${buildVars.filter(_.code.isEmpty).map(v => s"${v.isNull} = true;").mkString("\n")} - |} - |$numOutput.add(1); - |${consume(ctx, resultVars)} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - val found = ctx.freshName("found") - - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |boolean $found = false; - |// the last iteration of this loop is to emit an empty row if there is no matched rows. - |while ($matches != null && $matches.hasNext() || !$found) { - | UnsafeRow $matched = $matches != null && $matches.hasNext() ? - | (UnsafeRow) $matches.next() : null; - | ${checkCondition.trim} - | if ($conditionPassed) { - | $found = true; - | $numOutput.add(1); - | ${consume(ctx, resultVars)} - | } - |} - """.stripMargin - } - } - - /** - * Generates the code for left semi join. - */ - protected def codegenSemi(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, isEmptyHashedRelation) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val (matched, checkCondition, _) = getJoinCondition(ctx, input, streamedPlan, buildPlan) - val numOutput = metricTerm(ctx, "numOutputRows") - - if (isEmptyHashedRelation) { - """ - |// If HashedRelation is empty, hash semi join simply returns nothing. - """.stripMargin - } else if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |if ($matched != null) { - | $checkCondition { - | $numOutput.add(1); - | ${consume(ctx, input)} - | } - |} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - val found = ctx.freshName("found") - - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |if ($matches != null) { - | boolean $found = false; - | while (!$found && $matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition { - | $found = true; - | } - | } - | if ($found) { - | $numOutput.add(1); - | ${consume(ctx, input)} - | } - |} - """.stripMargin - } - } - - /** - * Generates the code for anti join. - */ - protected def codegenAnti(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, isEmptyHashedRelation) = prepareRelation(ctx) - val numOutput = metricTerm(ctx, "numOutputRows") - if (isEmptyHashedRelation) { - return s""" - |// If HashedRelation is empty, hash anti join simply returns the stream side. - |$numOutput.add(1); - |${consume(ctx, input)} - """.stripMargin - } - - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val (matched, checkCondition, _) = getJoinCondition(ctx, input, streamedPlan, buildPlan) - - if (keyIsUnique) { - val found = ctx.freshName("found") - s""" - |boolean $found = false; - |// generate join key for stream side - |${keyEv.code} - |// Check if the key has nulls. 
- |if (!($anyNull)) { - | // Check if the HashedRelation exists. - | UnsafeRow $matched = (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - | if ($matched != null) { - | // Evaluate the condition. - | $checkCondition { - | $found = true; - | } - | } - |} - |if (!$found) { - | $numOutput.add(1); - | ${consume(ctx, input)} - |} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - val found = ctx.freshName("found") - s""" - |boolean $found = false; - |// generate join key for stream side - |${keyEv.code} - |// Check if the key has nulls. - |if (!($anyNull)) { - | // Check if the HashedRelation exists. - | $iteratorCls $matches = ($iteratorCls)$relationTerm.get(${keyEv.value}); - | if ($matches != null) { - | // Evaluate the condition. - | while (!$found && $matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition { - | $found = true; - | } - | } - | } - |} - |if (!$found) { - | $numOutput.add(1); - | ${consume(ctx, input)} - |} - """.stripMargin - } - } - - /** - * Generates the code for existence join. - */ - protected def codegenExistence(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, _) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val numOutput = metricTerm(ctx, "numOutputRows") - val existsVar = ctx.freshName("exists") - - val matched = ctx.freshName("matched") - val buildVars = genOneSideJoinVars(ctx, matched, buildPlan, setDefaultValue = false) - val checkCondition = if (condition.isDefined) { - val expr = condition.get - // evaluate the variables from build side that used by condition - val eval = evaluateRequiredVariables(buildPlan.output, buildVars, expr.references) - // filter the output via condition - ctx.currentVars = input ++ buildVars - val ev = - BindReferences.bindReference(expr, streamedPlan.output ++ buildPlan.output).genCode(ctx) - s""" - |$eval - |${ev.code} - |$existsVar = !${ev.isNull} && ${ev.value}; - """.stripMargin - } else { - s"$existsVar = true;" - } - - val resultVar = input ++ Seq(ExprCode.forNonNullValue( - JavaCode.variable(existsVar, BooleanType))) - - if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |boolean $existsVar = false; - |if ($matched != null) { - | $checkCondition - |} - |$numOutput.add(1); - |${consume(ctx, resultVar)} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? 
null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |boolean $existsVar = false; - |if ($matches != null) { - | while (!$existsVar && $matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition - | } - |} - |$numOutput.add(1); - |${consume(ctx, resultVar)} - """.stripMargin - } - } - - protected def prepareRelation(ctx: CodegenContext): HashedRelationInfo -} - -object HashJoin extends CastSupport with SQLConfHelper { - - private def canRewriteAsLongType(keys: Seq[Expression]): Boolean = { - // TODO: support BooleanType, DateType and TimestampType - keys.forall(_.dataType.isInstanceOf[IntegralType]) && - keys.map(_.dataType.defaultSize).sum <= 8 - } - - /** - * Try to rewrite the key as LongType so we can use getLong(), if they key can fit with a long. - * - * If not, returns the original expressions. - */ - def rewriteKeyExpr(keys: Seq[Expression]): Seq[Expression] = { - assert(keys.nonEmpty) - if (!canRewriteAsLongType(keys)) { - return keys - } - - var keyExpr: Expression = if (keys.head.dataType != LongType) { - cast(keys.head, LongType) - } else { - keys.head - } - keys.tail.foreach { e => - val bits = e.dataType.defaultSize * 8 - keyExpr = BitwiseOr(ShiftLeft(keyExpr, Literal(bits)), - BitwiseAnd(cast(e, LongType), Literal((1L << bits) - 1))) - } - keyExpr :: Nil - } - - /** - * Extract a given key which was previously packed in a long value using its index to - * determine the number of bits to shift - */ - def extractKeyExprAt(keys: Seq[Expression], index: Int): Expression = { - assert(canRewriteAsLongType(keys)) - // jump over keys that have a higher index value than the required key - if (keys.size == 1) { - assert(index == 0) - Cast( - child = BoundReference(0, LongType, nullable = false), - dataType = keys(index).dataType, - timeZoneId = Option(conf.sessionLocalTimeZone), - ansiEnabled = false) - } else { - val shiftedBits = - keys.slice(index + 1, keys.size).map(_.dataType.defaultSize * 8).sum - val mask = (1L << (keys(index).dataType.defaultSize * 8)) - 1 - // build the schema for unpacking the required key - val castChild = BitwiseAnd( - ShiftRightUnsigned(BoundReference(0, LongType, nullable = false), Literal(shiftedBits)), - Literal(mask)) - Cast( - child = castChild, - dataType = keys(index).dataType, - timeZoneId = Option(conf.sessionLocalTimeZone), - ansiEnabled = false) - } - } -} diff --git a/shims/spark34/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala b/shims/spark34/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala index 3e42244bd8c64..8936e6ca63510 100644 --- a/shims/spark34/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala +++ b/shims/spark34/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala @@ -16,7 +16,7 @@ */ package org.apache.gluten.execution -import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.UnaryExecNode /** * Spark 3.5 has changed the parameter type of the generateTreeString API in TreeNode. In order to @@ -25,7 +25,7 @@ import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} * allow different Spark versions to override their own generateTreeString. 
*/ -trait GenerateTreeStringShim extends UnaryExecNode { +trait WholeStageTransformerGenerateTreeStringShim extends UnaryExecNode { def stageId: Int @@ -62,3 +62,28 @@ trait GenerateTreeStringShim extends UnaryExecNode { } } } + +trait InputAdapterGenerateTreeStringShim extends UnaryExecNode { + + override def generateTreeString( + depth: Int, + lastChildren: Seq[Boolean], + append: String => Unit, + verbose: Boolean, + prefix: String = "", + addSuffix: Boolean = false, + maxFields: Int, + printNodeId: Boolean, + indent: Int = 0): Unit = { + child.generateTreeString( + depth, + lastChildren, + append, + verbose, + prefix = "", + addSuffix = false, + maxFields, + printNodeId, + indent) + } +} diff --git a/shims/spark34/src/main/scala/org/apache/gluten/sql/shims/spark34/Spark34Shims.scala b/shims/spark34/src/main/scala/org/apache/gluten/sql/shims/spark34/Spark34Shims.scala index 4ab307e8568f6..203256cf5fec2 100644 --- a/shims/spark34/src/main/scala/org/apache/gluten/sql/shims/spark34/Spark34Shims.scala +++ b/shims/spark34/src/main/scala/org/apache/gluten/sql/shims/spark34/Spark34Shims.scala @@ -36,22 +36,26 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Distribution, KeyGroupedPartitioning, Partitioning} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, InternalRowComparableWrapper, TimestampFormatter} +import org.apache.spark.sql.catalyst.util.RebaseDateTime.RebaseSpec import org.apache.spark.sql.connector.catalog.Table import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.connector.read.{HasPartitionKey, InputPartition, Scan} import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.command.DataWritingCommandExec import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.parquet.ParquetFilters import org.apache.spark.sql.execution.datasources.v2.BatchScanExec import org.apache.spark.sql.execution.datasources.v2.text.TextScan import org.apache.spark.sql.execution.datasources.v2.utils.CatalogUtil import org.apache.spark.sql.execution.exchange.BroadcastExchangeLike import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy +import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.storage.{BlockId, BlockManagerId} import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.parquet.schema.MessageType import java.time.ZoneOffset import java.util.{HashMap => JHashMap, Map => JMap} @@ -73,7 +77,10 @@ class Spark34Shims extends SparkShims { Sig[Sec](ExpressionNames.SEC), Sig[Csc](ExpressionNames.CSC), Sig[KnownNullable](KNOWN_NULLABLE), - Sig[Empty2Null](ExpressionNames.EMPTY2NULL) + Sig[Empty2Null](ExpressionNames.EMPTY2NULL), + Sig[TimestampAdd](ExpressionNames.TIMESTAMP_ADD), + Sig[RoundFloor](ExpressionNames.FLOOR), + Sig[RoundCeil](ExpressionNames.CEIL) ) } @@ -208,6 +215,11 @@ class Spark34Shims extends SparkShims { case other => other } + override def getFileSizeAndModificationTime( + file: PartitionedFile): (Option[Long], Option[Long]) = { + (Some(file.fileSize), Some(file.modificationTime)) + } + override def generateMetadataColumns( file: PartitionedFile, metadataColumnNames: Seq[String]): JMap[String, String] = { 
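The hunk above introduces a getFileSizeAndModificationTime shim for Spark 3.4 that simply surfaces the fileSize and modificationTime fields already carried by PartitionedFile in that Spark version. As a rough sketch of how such a shim might be consumed (the SparkShims parameter, the describePartitionedFile helper name, and the import locations are illustrative assumptions, not part of this change):

import org.apache.gluten.sql.shims.SparkShims
import org.apache.spark.sql.execution.datasources.PartitionedFile

// Sketch only: query file-level metadata through the shim layer so the same call
// compiles against every supported Spark version; shims for versions that cannot
// provide these values may return (None, None).
def describePartitionedFile(shims: SparkShims, file: PartitionedFile): String = {
  val (sizeOpt, mtimeOpt) = shims.getFileSizeAndModificationTime(file)
  val size = sizeOpt.map(b => s"$b bytes").getOrElse("unknown size")
  val mtime = mtimeOpt.map(ms => s"$ms ms since epoch").getOrElse("unknown mtime")
  s"${file.filePath}: $size, last modified $mtime"
}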
@@ -231,8 +243,9 @@ class Spark34Shims extends SparkShims { case _ => } } - - // TODO: row_index metadata support + metadataColumn.put(InputFileName().prettyName, file.filePath.toString) + metadataColumn.put(InputFileBlockStart().prettyName, file.start.toString) + metadataColumn.put(InputFileBlockLength().prettyName, file.length.toString) metadataColumn } @@ -327,10 +340,37 @@ class Spark34Shims extends SparkShims { def getFileStatus(partition: PartitionDirectory): Seq[FileStatus] = partition.files + def isFileSplittable( + relation: HadoopFsRelation, + filePath: Path, + sparkSchema: StructType): Boolean = { + // SPARK-39634: Allow file splitting in combination with row index generation once + // the fix for PARQUET-2161 is available. + relation.fileFormat + .isSplitable(relation.sparkSession, relation.options, filePath) && + !(RowIndexUtil.findRowIndexColumnIndexInSchema(sparkSchema) >= 0) + } + def isRowIndexMetadataColumn(name: String): Boolean = { name == FileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME } + def findRowIndexColumnIndexInSchema(sparkSchema: StructType): Int = { + sparkSchema.fields.zipWithIndex.find { + case (field: StructField, _: Int) => + field.name == FileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME + } match { + case Some((field: StructField, idx: Int)) => + if (field.dataType != LongType && field.dataType != IntegerType) { + throw new RuntimeException( + s"${FileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME} " + + s"must be of LongType or IntegerType") + } + idx + case _ => -1 + } + } + def splitFiles( sparkSession: SparkSession, file: FileStatus, @@ -407,6 +447,9 @@ class Spark34Shims extends SparkShims { override def withTryEvalMode(expr: Expression): Boolean = { expr match { case a: Add => a.evalMode == EvalMode.TRY + case s: Subtract => s.evalMode == EvalMode.TRY + case d: Divide => d.evalMode == EvalMode.TRY + case m: Multiply => m.evalMode == EvalMode.TRY case _ => false } } @@ -414,6 +457,9 @@ class Spark34Shims extends SparkShims { override def withAnsiEvalMode(expr: Expression): Boolean = { expr match { case a: Add => a.evalMode == EvalMode.ANSI + case s: Subtract => s.evalMode == EvalMode.ANSI + case d: Divide => d.evalMode == EvalMode.ANSI + case m: Multiply => m.evalMode == EvalMode.ANSI case _ => false } } @@ -430,4 +476,20 @@ class Spark34Shims extends SparkShims { override def isPlannedV1Write(write: DataWritingCommandExec): Boolean = { write.cmd.isInstanceOf[V1WriteCommand] && SQLConf.get.plannedWriteEnabled } + + override def createParquetFilters( + conf: SQLConf, + schema: MessageType, + caseSensitive: Option[Boolean] = None): ParquetFilters = { + new ParquetFilters( + schema, + conf.parquetFilterPushDownDate, + conf.parquetFilterPushDownTimestamp, + conf.parquetFilterPushDownDecimal, + conf.parquetFilterPushDownStringPredicate, + conf.parquetFilterPushDownInFilterThreshold, + caseSensitive.getOrElse(conf.caseSensitiveAnalysis), + RebaseSpec(LegacyBehaviorPolicy.CORRECTED) + ) + } } diff --git a/shims/spark34/src/main/scala/org/apache/spark/sql/execution/FileSourceScanExecShim.scala b/shims/spark34/src/main/scala/org/apache/spark/sql/execution/FileSourceScanExecShim.scala index 33df953f32c87..bc3893ca201af 100644 --- a/shims/spark34/src/main/scala/org/apache/spark/sql/execution/FileSourceScanExecShim.scala +++ b/shims/spark34/src/main/scala/org/apache/spark/sql/execution/FileSourceScanExecShim.scala @@ -63,16 +63,11 @@ abstract class FileSourceScanExecShim( def hasUnsupportedColumns: Boolean = { val metadataColumnsNames = metadataColumns.map(_.name) - // row_index 
metadata is not support yet - metadataColumnsNames.contains(FileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME) || output .filterNot(metadataColumns.toSet) .exists(v => metadataColumnsNames.contains(v.name)) || // Below name has special meaning in Velox. - output.exists( - a => - a.name == "$path" || a.name == "$bucket" || - a.name == FileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME) + output.exists(a => a.name == "$path" || a.name == "$bucket") } def isMetadataColumn(attr: Attribute): Boolean = metadataColumns.contains(attr) diff --git a/shims/spark34/src/main/scala/org/apache/spark/sql/execution/JoinSelectionShim.scala b/shims/spark34/src/main/scala/org/apache/spark/sql/execution/JoinSelectionShim.scala deleted file mode 100644 index 20b9dea333a57..0000000000000 --- a/shims/spark34/src/main/scala/org/apache/spark/sql/execution/JoinSelectionShim.scala +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.sql.execution - -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys -import org.apache.spark.sql.catalyst.plans.JoinType -import org.apache.spark.sql.catalyst.plans.logical.{Join, JoinHint, LogicalPlan} - -// https://issues.apache.org/jira/browse/SPARK-36745 -object JoinSelectionShim { - object ExtractEquiJoinKeysShim { - type ReturnType = - ( - JoinType, - Seq[Expression], - Seq[Expression], - Option[Expression], - LogicalPlan, - LogicalPlan, - JoinHint) - def unapply(join: Join): Option[ReturnType] = { - ExtractEquiJoinKeys.unapply(join).map { - case ( - joinType, - leftKeys, - rightKeys, - otherPredicates, - predicatesOfJoinKeys, - left, - right, - hint) => - (joinType, leftKeys, rightKeys, otherPredicates, left, right, hint) - } - } - } -} diff --git a/shims/spark34/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCostEvaluator.scala b/shims/spark34/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCostEvaluator.scala new file mode 100644 index 0000000000000..8fcfa735f463d --- /dev/null +++ b/shims/spark34/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCostEvaluator.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.adaptive + +import org.apache.gluten.GlutenConfig + +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.internal.SQLConf + +/** This [[CostEvaluator]] is to force use the new physical plan when cost is equal. */ +case class GlutenCostEvaluator() extends CostEvaluator with SQLConfHelper { + override def evaluateCost(plan: SparkPlan): Cost = { + val forceOptimizeSkewedJoin = conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN) + if (GlutenConfig.getConf.enableGluten) { + new GlutenCost(SimpleCostEvaluator(forceOptimizeSkewedJoin), plan) + } else { + SimpleCostEvaluator(forceOptimizeSkewedJoin).evaluateCost(plan) + } + } +} diff --git a/shims/spark34/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExecShim.scala b/shims/spark34/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExecShim.scala index 64afc8193f4ec..bd3b09a01ef42 100644 --- a/shims/spark34/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExecShim.scala +++ b/shims/spark34/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExecShim.scala @@ -65,14 +65,10 @@ abstract class BatchScanExecShim( // TODO, fallback if user define same name column due to we can't right now // detect which column is metadata column which is user defined column. val metadataColumnsNames = metadataColumns.map(_.name) - metadataColumnsNames.contains(FileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME) || output .filterNot(metadataColumns.toSet) .exists(v => metadataColumnsNames.contains(v.name)) || - output.exists( - a => - a.name == "$path" || a.name == "$bucket" || - a.name == FileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME) + output.exists(a => a.name == "$path" || a.name == "$bucket") } override def doExecuteColumnar(): RDD[ColumnarBatch] = { diff --git a/shims/spark34/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala b/shims/spark34/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala index 77f15ac57087b..87aba00b0f593 100644 --- a/shims/spark34/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala +++ b/shims/spark34/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala @@ -77,7 +77,7 @@ abstract private[hive] class AbstractHiveTableScanExec( override val output: Seq[Attribute] = { // Retrieve the original attributes based on expression ID so that capitalization matches. 
- requestedAttributes.map(originalAttributes) + requestedAttributes.map(originalAttributes).distinct } // Bind all partition key attribute references in the partition pruning predicate for later diff --git a/shims/spark35/pom.xml b/shims/spark35/pom.xml index f5658eee2a3ff..1c79b882bf4ab 100644 --- a/shims/spark35/pom.xml +++ b/shims/spark35/pom.xml @@ -43,13 +43,13 @@ org.apache.spark - spark-catalyst_2.12 + spark-catalyst_${scala.binary.version} provided true org.apache.spark - spark-core_2.12 + spark-core_${scala.binary.version} provided true @@ -109,6 +109,9 @@ -Wconf:cat=deprecation:silent + diff --git a/shims/spark35/src/main/java/org/apache/spark/sql/execution/joins/HashJoin.scala.deprecated b/shims/spark35/src/main/java/org/apache/spark/sql/execution/joins/HashJoin.scala.deprecated deleted file mode 100644 index 3fc8237306140..0000000000000 --- a/shims/spark35/src/main/java/org/apache/spark/sql/execution/joins/HashJoin.scala.deprecated +++ /dev/null @@ -1,774 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.execution.joins - -import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} -import org.apache.spark.sql.catalyst.analysis.CastSupport -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences -import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide} -import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.physical.Partitioning -import org.apache.spark.sql.execution.{CodegenSupport, ExplainUtils, RowIterator} -import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.types.{BooleanType, IntegralType, LongType} - -/** - * @param relationTerm variable name for HashedRelation - * @param keyIsUnique indicate whether keys of HashedRelation known to be unique in code-gen time - * @param isEmpty indicate whether it known to be EmptyHashedRelation in code-gen time - */ -private[joins] case class HashedRelationInfo( - relationTerm: String, - keyIsUnique: Boolean, - isEmpty: Boolean) - -trait HashJoin extends JoinCodegenSupport { - def buildSide: BuildSide - - override def simpleStringWithNodeId(): String = { - val opId = ExplainUtils.getOpId(this) - s"$nodeName $joinType ${buildSide} ($opId)".trim - } - - override def output: Seq[Attribute] = { - joinType match { - case _: InnerLike => - left.output ++ right.output - case LeftOuter => - left.output ++ right.output.map(_.withNullability(true)) - case RightOuter => - left.output.map(_.withNullability(true)) ++ right.output - case j: ExistenceJoin => - left.output :+ j.exists - case LeftExistence(_) => - left.output - case x => - throw new IllegalArgumentException(s"HashJoin should not take $x as the JoinType") - } - } - - override def outputPartitioning: Partitioning = buildSide match { - case BuildLeft => - joinType match { - case _: InnerLike | RightOuter => right.outputPartitioning - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building left side") - } - case BuildRight => - joinType match { - case _: InnerLike | LeftOuter | LeftSemi | LeftAnti | _: ExistenceJoin => - left.outputPartitioning - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building right side") - } - } - - /** - * Handle the special cases for LeftOuter/LeftSemi with BuildLeft and RightOuter with BuildRight. 
- */ - override def outputOrdering: Seq[SortOrder] = buildSide match { - case BuildLeft => - joinType match { - case _: InnerLike | RightOuter => right.outputOrdering - case LeftOuter => left.outputOrdering - case LeftSemi => left.outputOrdering - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building left side") - } - case BuildRight => - joinType match { - case _: InnerLike | LeftOuter | LeftSemi | LeftAnti | _: ExistenceJoin => - left.outputOrdering - case RightOuter => right.outputOrdering - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType with building right side") - } - } - - protected lazy val (buildPlan, streamedPlan) = buildSide match { - case BuildLeft => (left, right) - case BuildRight => (right, left) - } - - protected lazy val (buildKeys, streamedKeys) = { - require(leftKeys.length == rightKeys.length && - leftKeys.map(_.dataType) - .zip(rightKeys.map(_.dataType)) - .forall(types => types._1.sameType(types._2)), - "Join keys from two sides should have same length and types") - buildSide match { - case BuildLeft => (leftKeys, rightKeys) - case BuildRight => (rightKeys, leftKeys) - } - } - - @transient protected lazy val (buildOutput, streamedOutput) = { - buildSide match { - case BuildLeft => (left.output, right.output) - case BuildRight => (right.output, left.output) - } - } - - @transient protected lazy val buildBoundKeys = - bindReferences(HashJoin.rewriteKeyExpr(buildKeys), buildOutput) - - @transient protected lazy val streamedBoundKeys = - bindReferences(HashJoin.rewriteKeyExpr(streamedKeys), streamedOutput) - - protected def buildSideKeyGenerator(): Projection = - UnsafeProjection.create(buildBoundKeys) - - protected def streamSideKeyGenerator(): UnsafeProjection = - UnsafeProjection.create(streamedBoundKeys) - - @transient protected[this] lazy val boundCondition = if (condition.isDefined) { - if (joinType == FullOuter && buildSide == BuildLeft) { - // Put join left side before right side. This is to be consistent with - // `ShuffledHashJoinExec.fullOuterJoin`. 
- Predicate.create(condition.get, buildPlan.output ++ streamedPlan.output).eval _ - } else { - Predicate.create(condition.get, streamedPlan.output ++ buildPlan.output).eval _ - } - } else { - (r: InternalRow) => true - } - - protected def createResultProjection(): (InternalRow) => InternalRow = joinType match { - case LeftExistence(_) => - UnsafeProjection.create(output, output) - case _ => - // Always put the stream side on left to simplify implementation - // both of left and right side could be null - UnsafeProjection.create( - output, (streamedPlan.output ++ buildPlan.output).map(_.withNullability(true))) - } - - private def innerJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinRow = new JoinedRow - val joinKeys = streamSideKeyGenerator() - - if (hashedRelation == EmptyHashedRelation) { - Iterator.empty - } else if (hashedRelation.keyIsUnique) { - streamIter.flatMap { srow => - joinRow.withLeft(srow) - val matched = hashedRelation.getValue(joinKeys(srow)) - if (matched != null) { - Some(joinRow.withRight(matched)).filter(boundCondition) - } else { - None - } - } - } else { - streamIter.flatMap { srow => - joinRow.withLeft(srow) - val matches = hashedRelation.get(joinKeys(srow)) - if (matches != null) { - matches.map(joinRow.withRight).filter(boundCondition) - } else { - Seq.empty - } - } - } - } - - private def outerJoin( - streamedIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinedRow = new JoinedRow() - val keyGenerator = streamSideKeyGenerator() - val nullRow = new GenericInternalRow(buildPlan.output.length) - - if (hashedRelation.keyIsUnique) { - streamedIter.map { currentRow => - val rowKey = keyGenerator(currentRow) - joinedRow.withLeft(currentRow) - val matched = hashedRelation.getValue(rowKey) - if (matched != null && boundCondition(joinedRow.withRight(matched))) { - joinedRow - } else { - joinedRow.withRight(nullRow) - } - } - } else { - streamedIter.flatMap { currentRow => - val rowKey = keyGenerator(currentRow) - joinedRow.withLeft(currentRow) - val buildIter = hashedRelation.get(rowKey) - new RowIterator { - private var found = false - override def advanceNext(): Boolean = { - while (buildIter != null && buildIter.hasNext) { - val nextBuildRow = buildIter.next() - if (boundCondition(joinedRow.withRight(nextBuildRow))) { - found = true - return true - } - } - if (!found) { - joinedRow.withRight(nullRow) - found = true - return true - } - false - } - override def getRow: InternalRow = joinedRow - }.toScala - } - } - } - - private def semiJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinKeys = streamSideKeyGenerator() - val joinedRow = new JoinedRow - - if (hashedRelation == EmptyHashedRelation) { - Iterator.empty - } else if (hashedRelation.keyIsUnique) { - streamIter.filter { current => - val key = joinKeys(current) - lazy val matched = hashedRelation.getValue(key) - !key.anyNull && matched != null && - (condition.isEmpty || boundCondition(joinedRow(current, matched))) - } - } else { - streamIter.filter { current => - val key = joinKeys(current) - lazy val buildIter = hashedRelation.get(key) - !key.anyNull && buildIter != null && (condition.isEmpty || buildIter.exists { - (row: InternalRow) => boundCondition(joinedRow(current, row)) - }) - } - } - } - - private def existenceJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - val joinKeys = 
streamSideKeyGenerator() - val result = new GenericInternalRow(Array[Any](null)) - val joinedRow = new JoinedRow - - if (hashedRelation.keyIsUnique) { - streamIter.map { current => - val key = joinKeys(current) - lazy val matched = hashedRelation.getValue(key) - val exists = !key.anyNull && matched != null && - (condition.isEmpty || boundCondition(joinedRow(current, matched))) - result.setBoolean(0, exists) - joinedRow(current, result) - } - } else { - streamIter.map { current => - val key = joinKeys(current) - lazy val buildIter = hashedRelation.get(key) - val exists = !key.anyNull && buildIter != null && (condition.isEmpty || buildIter.exists { - (row: InternalRow) => boundCondition(joinedRow(current, row)) - }) - result.setBoolean(0, exists) - joinedRow(current, result) - } - } - } - - private def antiJoin( - streamIter: Iterator[InternalRow], - hashedRelation: HashedRelation): Iterator[InternalRow] = { - // If the right side is empty, AntiJoin simply returns the left side. - if (hashedRelation == EmptyHashedRelation) { - return streamIter - } - - val joinKeys = streamSideKeyGenerator() - val joinedRow = new JoinedRow - - if (hashedRelation.keyIsUnique) { - streamIter.filter { current => - val key = joinKeys(current) - lazy val matched = hashedRelation.getValue(key) - key.anyNull || matched == null || - (condition.isDefined && !boundCondition(joinedRow(current, matched))) - } - } else { - streamIter.filter { current => - val key = joinKeys(current) - lazy val buildIter = hashedRelation.get(key) - key.anyNull || buildIter == null || (condition.isDefined && !buildIter.exists { - row => boundCondition(joinedRow(current, row)) - }) - } - } - } - - protected def join( - streamedIter: Iterator[InternalRow], - hashed: HashedRelation, - numOutputRows: SQLMetric): Iterator[InternalRow] = { - - val joinedIter = joinType match { - case _: InnerLike => - innerJoin(streamedIter, hashed) - case LeftOuter | RightOuter => - outerJoin(streamedIter, hashed) - case LeftSemi => - semiJoin(streamedIter, hashed) - case LeftAnti => - antiJoin(streamedIter, hashed) - case _: ExistenceJoin => - existenceJoin(streamedIter, hashed) - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType") - } - - val resultProj = createResultProjection - joinedIter.map { r => - numOutputRows += 1 - resultProj(r) - } - } - - override def doProduce(ctx: CodegenContext): String = { - streamedPlan.asInstanceOf[CodegenSupport].produce(ctx, this) - } - - override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { - joinType match { - case _: InnerLike => codegenInner(ctx, input) - case LeftOuter | RightOuter => codegenOuter(ctx, input) - case LeftSemi => codegenSemi(ctx, input) - case LeftAnti => codegenAnti(ctx, input) - case _: ExistenceJoin => codegenExistence(ctx, input) - case x => - throw new IllegalArgumentException( - s"HashJoin should not take $x as the JoinType") - } - } - - /** - * Returns the code for generating join key for stream side, and expression of whether the key - * has any null in it or not. 
- */ - protected def genStreamSideJoinKey( - ctx: CodegenContext, - input: Seq[ExprCode]): (ExprCode, String) = { - ctx.currentVars = input - if (streamedBoundKeys.length == 1 && streamedBoundKeys.head.dataType == LongType) { - // generate the join key as Long - val ev = streamedBoundKeys.head.genCode(ctx) - (ev, ev.isNull) - } else { - // generate the join key as UnsafeRow - val ev = GenerateUnsafeProjection.createCode(ctx, streamedBoundKeys) - (ev, s"${ev.value}.anyNull()") - } - } - - /** - * Generates the code for Inner join. - */ - protected def codegenInner(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, isEmptyHashedRelation) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val (matched, checkCondition, buildVars) = getJoinCondition(ctx, input, streamedPlan, buildPlan) - val numOutput = metricTerm(ctx, "numOutputRows") - - val resultVars = buildSide match { - case BuildLeft => buildVars ++ input - case BuildRight => input ++ buildVars - } - - if (isEmptyHashedRelation) { - """ - |// If HashedRelation is empty, hash inner join simply returns nothing. - """.stripMargin - } else if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |if ($matched != null) { - | $checkCondition { - | $numOutput.add(1); - | ${consume(ctx, resultVars)} - | } - |} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? - | null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |if ($matches != null) { - | while ($matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition { - | $numOutput.add(1); - | ${consume(ctx, resultVars)} - | } - | } - |} - """.stripMargin - } - } - - /** - * Generates the code for left or right outer join. - */ - protected def codegenOuter(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, _) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val matched = ctx.freshName("matched") - val buildVars = genOneSideJoinVars(ctx, matched, buildPlan, setDefaultValue = true) - val numOutput = metricTerm(ctx, "numOutputRows") - - // filter the output via condition - val conditionPassed = ctx.freshName("conditionPassed") - val checkCondition = if (condition.isDefined) { - val expr = condition.get - // evaluate the variables from build side that used by condition - val eval = evaluateRequiredVariables(buildPlan.output, buildVars, expr.references) - ctx.currentVars = input ++ buildVars - val ev = - BindReferences.bindReference(expr, streamedPlan.output ++ buildPlan.output).genCode(ctx) - s""" - |boolean $conditionPassed = true; - |${eval.trim} - |if ($matched != null) { - | ${ev.code} - | $conditionPassed = !${ev.isNull} && ${ev.value}; - |} - """.stripMargin - } else { - s"final boolean $conditionPassed = true;" - } - - val resultVars = buildSide match { - case BuildLeft => buildVars ++ input - case BuildRight => input ++ buildVars - } - - if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? 
null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |${checkCondition.trim} - |if (!$conditionPassed) { - | $matched = null; - | // reset the variables those are already evaluated. - | ${buildVars.filter(_.code.isEmpty).map(v => s"${v.isNull} = true;").mkString("\n")} - |} - |$numOutput.add(1); - |${consume(ctx, resultVars)} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - val found = ctx.freshName("found") - - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |boolean $found = false; - |// the last iteration of this loop is to emit an empty row if there is no matched rows. - |while ($matches != null && $matches.hasNext() || !$found) { - | UnsafeRow $matched = $matches != null && $matches.hasNext() ? - | (UnsafeRow) $matches.next() : null; - | ${checkCondition.trim} - | if ($conditionPassed) { - | $found = true; - | $numOutput.add(1); - | ${consume(ctx, resultVars)} - | } - |} - """.stripMargin - } - } - - /** - * Generates the code for left semi join. - */ - protected def codegenSemi(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, isEmptyHashedRelation) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val (matched, checkCondition, _) = getJoinCondition(ctx, input, streamedPlan, buildPlan) - val numOutput = metricTerm(ctx, "numOutputRows") - - if (isEmptyHashedRelation) { - """ - |// If HashedRelation is empty, hash semi join simply returns nothing. - """.stripMargin - } else if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |if ($matched != null) { - | $checkCondition { - | $numOutput.add(1); - | ${consume(ctx, input)} - | } - |} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - val found = ctx.freshName("found") - - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |if ($matches != null) { - | boolean $found = false; - | while (!$found && $matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition { - | $found = true; - | } - | } - | if ($found) { - | $numOutput.add(1); - | ${consume(ctx, input)} - | } - |} - """.stripMargin - } - } - - /** - * Generates the code for anti join. - */ - protected def codegenAnti(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, isEmptyHashedRelation) = prepareRelation(ctx) - val numOutput = metricTerm(ctx, "numOutputRows") - if (isEmptyHashedRelation) { - return s""" - |// If HashedRelation is empty, hash anti join simply returns the stream side. - |$numOutput.add(1); - |${consume(ctx, input)} - """.stripMargin - } - - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val (matched, checkCondition, _) = getJoinCondition(ctx, input, streamedPlan, buildPlan) - - if (keyIsUnique) { - val found = ctx.freshName("found") - s""" - |boolean $found = false; - |// generate join key for stream side - |${keyEv.code} - |// Check if the key has nulls. 
- |if (!($anyNull)) { - | // Check if the HashedRelation exists. - | UnsafeRow $matched = (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - | if ($matched != null) { - | // Evaluate the condition. - | $checkCondition { - | $found = true; - | } - | } - |} - |if (!$found) { - | $numOutput.add(1); - | ${consume(ctx, input)} - |} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - val found = ctx.freshName("found") - s""" - |boolean $found = false; - |// generate join key for stream side - |${keyEv.code} - |// Check if the key has nulls. - |if (!($anyNull)) { - | // Check if the HashedRelation exists. - | $iteratorCls $matches = ($iteratorCls)$relationTerm.get(${keyEv.value}); - | if ($matches != null) { - | // Evaluate the condition. - | while (!$found && $matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition { - | $found = true; - | } - | } - | } - |} - |if (!$found) { - | $numOutput.add(1); - | ${consume(ctx, input)} - |} - """.stripMargin - } - } - - /** - * Generates the code for existence join. - */ - protected def codegenExistence(ctx: CodegenContext, input: Seq[ExprCode]): String = { - val HashedRelationInfo(relationTerm, keyIsUnique, _) = prepareRelation(ctx) - val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input) - val numOutput = metricTerm(ctx, "numOutputRows") - val existsVar = ctx.freshName("exists") - - val matched = ctx.freshName("matched") - val buildVars = genOneSideJoinVars(ctx, matched, buildPlan, setDefaultValue = false) - val checkCondition = if (condition.isDefined) { - val expr = condition.get - // evaluate the variables from build side that used by condition - val eval = evaluateRequiredVariables(buildPlan.output, buildVars, expr.references) - // filter the output via condition - ctx.currentVars = input ++ buildVars - val ev = - BindReferences.bindReference(expr, streamedPlan.output ++ buildPlan.output).genCode(ctx) - s""" - |$eval - |${ev.code} - |$existsVar = !${ev.isNull} && ${ev.value}; - """.stripMargin - } else { - s"$existsVar = true;" - } - - val resultVar = input ++ Seq(ExprCode.forNonNullValue( - JavaCode.variable(existsVar, BooleanType))) - - if (keyIsUnique) { - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashedRelation - |UnsafeRow $matched = $anyNull ? null: (UnsafeRow)$relationTerm.getValue(${keyEv.value}); - |boolean $existsVar = false; - |if ($matched != null) { - | $checkCondition - |} - |$numOutput.add(1); - |${consume(ctx, resultVar)} - """.stripMargin - } else { - val matches = ctx.freshName("matches") - val iteratorCls = classOf[Iterator[UnsafeRow]].getName - s""" - |// generate join key for stream side - |${keyEv.code} - |// find matches from HashRelation - |$iteratorCls $matches = $anyNull ? 
null : ($iteratorCls)$relationTerm.get(${keyEv.value}); - |boolean $existsVar = false; - |if ($matches != null) { - | while (!$existsVar && $matches.hasNext()) { - | UnsafeRow $matched = (UnsafeRow) $matches.next(); - | $checkCondition - | } - |} - |$numOutput.add(1); - |${consume(ctx, resultVar)} - """.stripMargin - } - } - - protected def prepareRelation(ctx: CodegenContext): HashedRelationInfo -} - -object HashJoin extends CastSupport with SQLConfHelper { - - private def canRewriteAsLongType(keys: Seq[Expression]): Boolean = { - // TODO: support BooleanType, DateType and TimestampType - keys.forall(_.dataType.isInstanceOf[IntegralType]) && - keys.map(_.dataType.defaultSize).sum <= 8 - } - - /** - * Try to rewrite the key as LongType so we can use getLong(), if they key can fit with a long. - * - * If not, returns the original expressions. - */ - def rewriteKeyExpr(keys: Seq[Expression]): Seq[Expression] = { - assert(keys.nonEmpty) - if (!canRewriteAsLongType(keys)) { - return keys - } - - var keyExpr: Expression = if (keys.head.dataType != LongType) { - cast(keys.head, LongType) - } else { - keys.head - } - keys.tail.foreach { e => - val bits = e.dataType.defaultSize * 8 - keyExpr = BitwiseOr(ShiftLeft(keyExpr, Literal(bits)), - BitwiseAnd(cast(e, LongType), Literal((1L << bits) - 1))) - } - keyExpr :: Nil - } - - /** - * Extract a given key which was previously packed in a long value using its index to - * determine the number of bits to shift - */ - def extractKeyExprAt(keys: Seq[Expression], index: Int): Expression = { - assert(canRewriteAsLongType(keys)) - // jump over keys that have a higher index value than the required key - if (keys.size == 1) { - assert(index == 0) - Cast( - child = BoundReference(0, LongType, nullable = false), - dataType = keys(index).dataType, - timeZoneId = Option(conf.sessionLocalTimeZone), - ansiEnabled = false) - } else { - val shiftedBits = - keys.slice(index + 1, keys.size).map(_.dataType.defaultSize * 8).sum - val mask = (1L << (keys(index).dataType.defaultSize * 8)) - 1 - // build the schema for unpacking the required key - val castChild = BitwiseAnd( - ShiftRightUnsigned(BoundReference(0, LongType, nullable = false), Literal(shiftedBits)), - Literal(mask)) - Cast( - child = castChild, - dataType = keys(index).dataType, - timeZoneId = Option(conf.sessionLocalTimeZone), - ansiEnabled = false) - } - } -} diff --git a/shims/spark35/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala b/shims/spark35/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala index 1dc6d7f7174da..7628b210f1c13 100644 --- a/shims/spark35/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala +++ b/shims/spark35/src/main/scala/org/apache/gluten/execution/GenerateTreeStringShim.scala @@ -16,7 +16,7 @@ */ package org.apache.gluten.execution -import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.UnaryExecNode /** * Spark 3.5 has changed the parameter type of the generateTreeString API in TreeNode. In order to @@ -25,7 +25,7 @@ import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} * allow different Spark versions to override their own generateTreeString. 
*/ -trait GenerateTreeStringShim extends UnaryExecNode { +trait WholeStageTransformerGenerateTreeStringShim extends UnaryExecNode { def stageId: Int @@ -62,3 +62,28 @@ trait GenerateTreeStringShim extends UnaryExecNode { } } } + +trait InputAdapterGenerateTreeStringShim extends UnaryExecNode { + + override def generateTreeString( + depth: Int, + lastChildren: java.util.ArrayList[Boolean], + append: String => Unit, + verbose: Boolean, + prefix: String = "", + addSuffix: Boolean = false, + maxFields: Int, + printNodeId: Boolean, + indent: Int = 0): Unit = { + child.generateTreeString( + depth, + lastChildren, + append, + verbose, + prefix = "", + addSuffix = false, + maxFields, + printNodeId, + indent) + } +} diff --git a/shims/spark35/src/main/scala/org/apache/gluten/sql/shims/spark35/Spark35Shims.scala b/shims/spark35/src/main/scala/org/apache/gluten/sql/shims/spark35/Spark35Shims.scala index ef1cea865d49d..821e0f5837d69 100644 --- a/shims/spark35/src/main/scala/org/apache/gluten/sql/shims/spark35/Spark35Shims.scala +++ b/shims/spark35/src/main/scala/org/apache/gluten/sql/shims/spark35/Spark35Shims.scala @@ -36,24 +36,26 @@ import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Dist import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, InternalRowComparableWrapper, TimestampFormatter} +import org.apache.spark.sql.catalyst.util.RebaseDateTime.RebaseSpec import org.apache.spark.sql.connector.catalog.Table import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.connector.read.{HasPartitionKey, InputPartition, Scan} import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.command.DataWritingCommandExec import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetFilters} import org.apache.spark.sql.execution.datasources.v2.BatchScanExec import org.apache.spark.sql.execution.datasources.v2.text.TextScan import org.apache.spark.sql.execution.datasources.v2.utils.CatalogUtil import org.apache.spark.sql.execution.exchange.{BroadcastExchangeLike, ShuffleExchangeLike} import org.apache.spark.sql.execution.window.{WindowGroupLimitExec, WindowGroupLimitExecShim} -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf} +import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.storage.{BlockId, BlockManagerId} import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.parquet.schema.MessageType import java.time.ZoneOffset import java.util.{HashMap => JHashMap, Map => JMap} @@ -74,7 +76,12 @@ class Spark35Shims extends SparkShims { Sig[SplitPart](ExpressionNames.SPLIT_PART), Sig[Sec](ExpressionNames.SEC), Sig[Csc](ExpressionNames.CSC), - Sig[Empty2Null](ExpressionNames.EMPTY2NULL)) + Sig[KnownNullable](ExpressionNames.KNOWN_NULLABLE), + Sig[Empty2Null](ExpressionNames.EMPTY2NULL), + Sig[TimestampAdd](ExpressionNames.TIMESTAMP_ADD), + Sig[RoundFloor](ExpressionNames.FLOOR), + Sig[RoundCeil](ExpressionNames.CEIL) + ) } override def aggregateExpressionMappings: Seq[Sig] = { @@ -206,6 +213,11 @@ class Spark35Shims extends SparkShims { case other => other } 
+ override def getFileSizeAndModificationTime( + file: PartitionedFile): (Option[Long], Option[Long]) = { + (Some(file.fileSize), Some(file.modificationTime)) + } + override def generateMetadataColumns( file: PartitionedFile, metadataColumnNames: Seq[String]): JMap[String, String] = { @@ -229,8 +241,9 @@ class Spark35Shims extends SparkShims { case _ => } } - - // TODO row_index metadata support + metadataColumn.put(InputFileName().prettyName, file.filePath.toString) + metadataColumn.put(InputFileBlockStart().prettyName, file.start.toString) + metadataColumn.put(InputFileBlockLength().prettyName, file.length.toString) metadataColumn } @@ -356,8 +369,31 @@ class Spark35Shims extends SparkShims { def getFileStatus(partition: PartitionDirectory): Seq[FileStatus] = partition.files.map(_.fileStatus) - def isRowIndexMetadataColumn(name: String): Boolean = { + def isFileSplittable( + relation: HadoopFsRelation, + filePath: Path, + sparkSchema: StructType): Boolean = { + relation.fileFormat + .isSplitable(relation.sparkSession, relation.options, filePath) + } + + def isRowIndexMetadataColumn(name: String): Boolean = name == ParquetFileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME + + def findRowIndexColumnIndexInSchema(sparkSchema: StructType): Int = { + sparkSchema.fields.zipWithIndex.find { + case (field: StructField, _: Int) => + field.name == ParquetFileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME + } match { + case Some((field: StructField, idx: Int)) => + if (field.dataType != LongType && field.dataType != IntegerType) { + throw new RuntimeException( + s"${ParquetFileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME} " + + "must be of LongType or IntegerType") + } + idx + case _ => -1 + } } def splitFiles( @@ -436,6 +472,9 @@ class Spark35Shims extends SparkShims { override def withTryEvalMode(expr: Expression): Boolean = { expr match { case a: Add => a.evalMode == EvalMode.TRY + case s: Subtract => s.evalMode == EvalMode.TRY + case d: Divide => d.evalMode == EvalMode.TRY + case m: Multiply => m.evalMode == EvalMode.TRY case _ => false } } @@ -443,6 +482,9 @@ class Spark35Shims extends SparkShims { override def withAnsiEvalMode(expr: Expression): Boolean = { expr match { case a: Add => a.evalMode == EvalMode.ANSI + case s: Subtract => s.evalMode == EvalMode.ANSI + case d: Divide => d.evalMode == EvalMode.ANSI + case m: Multiply => m.evalMode == EvalMode.ANSI case _ => false } } @@ -459,4 +501,20 @@ class Spark35Shims extends SparkShims { override def isPlannedV1Write(write: DataWritingCommandExec): Boolean = { write.cmd.isInstanceOf[V1WriteCommand] && SQLConf.get.plannedWriteEnabled } + + override def createParquetFilters( + conf: SQLConf, + schema: MessageType, + caseSensitive: Option[Boolean] = None): ParquetFilters = { + new ParquetFilters( + schema, + conf.parquetFilterPushDownDate, + conf.parquetFilterPushDownTimestamp, + conf.parquetFilterPushDownDecimal, + conf.parquetFilterPushDownStringPredicate, + conf.parquetFilterPushDownInFilterThreshold, + caseSensitive.getOrElse(conf.caseSensitiveAnalysis), + RebaseSpec(LegacyBehaviorPolicy.CORRECTED) + ) + } } diff --git a/shims/spark35/src/main/scala/org/apache/spark/sql/execution/FileSourceScanExecShim.scala b/shims/spark35/src/main/scala/org/apache/spark/sql/execution/FileSourceScanExecShim.scala index dccf1bbced1d0..c8795e31ceb43 100644 --- a/shims/spark35/src/main/scala/org/apache/spark/sql/execution/FileSourceScanExecShim.scala +++ b/shims/spark35/src/main/scala/org/apache/spark/sql/execution/FileSourceScanExecShim.scala @@ -67,14 +67,10 @@ abstract class 
FileSourceScanExecShim( // TODO, fallback if user define same name column due to we can't right now // detect which column is metadata column which is user defined column. val metadataColumnsNames = metadataColumns.map(_.name) - metadataColumnsNames.contains(ParquetFileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME) || output .filterNot(metadataColumns.toSet) .exists(v => metadataColumnsNames.contains(v.name)) || - output.exists( - a => - a.name == "$path" || a.name == "$bucket" || - a.name == ParquetFileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME) + output.exists(a => a.name == "$path" || a.name == "$bucket") } def isMetadataColumn(attr: Attribute): Boolean = metadataColumns.contains(attr) diff --git a/shims/spark35/src/main/scala/org/apache/spark/sql/execution/JoinSelectionShim.scala b/shims/spark35/src/main/scala/org/apache/spark/sql/execution/JoinSelectionShim.scala deleted file mode 100644 index 20b9dea333a57..0000000000000 --- a/shims/spark35/src/main/scala/org/apache/spark/sql/execution/JoinSelectionShim.scala +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.sql.execution - -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys -import org.apache.spark.sql.catalyst.plans.JoinType -import org.apache.spark.sql.catalyst.plans.logical.{Join, JoinHint, LogicalPlan} - -// https://issues.apache.org/jira/browse/SPARK-36745 -object JoinSelectionShim { - object ExtractEquiJoinKeysShim { - type ReturnType = - ( - JoinType, - Seq[Expression], - Seq[Expression], - Option[Expression], - LogicalPlan, - LogicalPlan, - JoinHint) - def unapply(join: Join): Option[ReturnType] = { - ExtractEquiJoinKeys.unapply(join).map { - case ( - joinType, - leftKeys, - rightKeys, - otherPredicates, - predicatesOfJoinKeys, - left, - right, - hint) => - (joinType, leftKeys, rightKeys, otherPredicates, left, right, hint) - } - } - } -} diff --git a/shims/spark35/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCostEvaluator.scala b/shims/spark35/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCostEvaluator.scala new file mode 100644 index 0000000000000..8fcfa735f463d --- /dev/null +++ b/shims/spark35/src/main/scala/org/apache/spark/sql/execution/adaptive/GlutenCostEvaluator.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.adaptive + +import org.apache.gluten.GlutenConfig + +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.internal.SQLConf + +/** This [[CostEvaluator]] forces the use of the new physical plan when costs are equal. */ +case class GlutenCostEvaluator() extends CostEvaluator with SQLConfHelper { + override def evaluateCost(plan: SparkPlan): Cost = { + val forceOptimizeSkewedJoin = conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN) + if (GlutenConfig.getConf.enableGluten) { + new GlutenCost(SimpleCostEvaluator(forceOptimizeSkewedJoin), plan) + } else { + SimpleCostEvaluator(forceOptimizeSkewedJoin).evaluateCost(plan) + } + } +} diff --git a/shims/spark35/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExecShim.scala b/shims/spark35/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExecShim.scala index 8949a46a1ddd0..343070d7f209e 100644 --- a/shims/spark35/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExecShim.scala +++ b/shims/spark35/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExecShim.scala @@ -67,14 +67,10 @@ abstract class BatchScanExecShim( // TODO, fallback if user define same name column due to we can't right now // detect which column is metadata column which is user defined column. val metadataColumnsNames = metadataColumns.map(_.name) - metadataColumnsNames.contains(ParquetFileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME) || output .filterNot(metadataColumns.toSet) .exists(v => metadataColumnsNames.contains(v.name)) || - output.exists( - a => - a.name == "$path" || a.name == "$bucket" || - a.name == ParquetFileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME) + output.exists(a => a.name == "$path" || a.name == "$bucket") } override def doExecuteColumnar(): RDD[ColumnarBatch] = { diff --git a/shims/spark35/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala b/shims/spark35/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala index 77f15ac57087b..87aba00b0f593 100644 --- a/shims/spark35/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala +++ b/shims/spark35/src/main/scala/org/apache/spark/sql/hive/execution/AbstractHiveTableScanExec.scala @@ -77,7 +77,7 @@ abstract private[hive] class AbstractHiveTableScanExec( override val output: Seq[Attribute] = { // Retrieve the original attributes based on expression ID so that capitalization matches. - requestedAttributes.map(originalAttributes) + requestedAttributes.map(originalAttributes).distinct } // Bind all partition key attribute references in the partition pruning predicate for later diff --git a/tools/gluten-it/README.md b/tools/gluten-it/README.md index 59ae55e14f18a..37ed7e82b4f33 100644 --- a/tools/gluten-it/README.md +++ b/tools/gluten-it/README.md @@ -30,7 +30,7 @@ sbin/gluten-it.sh ``` Usage: gluten-it [-hV] [COMMAND] -Gluten integration test using TPC benchmark's data and queries.
+Gluten integration test using various benchmarks' data and queries. -h, --help Show this help message and exit. -V, --version Print version information and exit. Commands: diff --git a/tools/gluten-it/common/pom.xml b/tools/gluten-it/common/pom.xml index 4cd4684724ad9..de0d7c2e6c8ca 100644 --- a/tools/gluten-it/common/pom.xml +++ b/tools/gluten-it/common/pom.xml @@ -57,6 +57,17 @@ provided test-jar + + org.apache.spark + spark-sql_${scala.binary.version} + provided + + + org.apache.spark + spark-sql_${scala.binary.version} + provided + test-jar + io.trino.tpcds diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/TpcMixin.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java similarity index 82% rename from tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/TpcMixin.java rename to tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java index c0313fe77b10e..93c82a6fa2573 100644 --- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/TpcMixin.java +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java @@ -14,12 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.integration.tpc; +package org.apache.gluten.integration; -import org.apache.gluten.integration.tpc.action.Action; -import org.apache.gluten.integration.tpc.command.SparkRunModes; -import org.apache.gluten.integration.tpc.ds.TpcdsSuite; -import org.apache.gluten.integration.tpc.h.TpchSuite; +import org.apache.gluten.integration.action.Action; +import org.apache.gluten.integration.clickbench.ClickBenchSuite; +import org.apache.gluten.integration.command.SparkRunModes; +import org.apache.gluten.integration.ds.TpcdsSuite; +import org.apache.gluten.integration.h.TpchSuite; import org.apache.log4j.Level; import org.apache.spark.SparkConf; import picocli.CommandLine; @@ -30,9 +31,9 @@ import java.util.HashMap; import java.util.Map; -public class TpcMixin { +public class BaseMixin { - @CommandLine.Option(required = true, names = {"--benchmark-type"}, description = "TPC benchmark type: h, ds", defaultValue = "h") + @CommandLine.Option(required = true, names = {"--benchmark-type"}, description = "Benchmark type: h, ds, clickbench", defaultValue = "h") private String benchmarkType; @CommandLine.Option(names = {"-p", "--preset"}, description = "Preset used: vanilla, velox, velox-with-celeborn, velox-with-uniffle...", defaultValue = "velox") @@ -71,11 +72,11 @@ public class TpcMixin { @CommandLine.Option(names = {"--disable-wscg"}, description = "Disable Spark SQL whole stage code generation", defaultValue = "false") private boolean disableWscg; - @CommandLine.Option(names = {"--shuffle-partitions"}, description = "Generate data with partitions", defaultValue = "100") + @CommandLine.Option(names = {"--shuffle-partitions"}, description = "Shuffle partition number", defaultValue = "100") private int shufflePartitions; - @CommandLine.Option(names = {"--min-scan-partitions"}, description = "Use minimum number of partitions to read data", defaultValue = "false") - private boolean minimumScanPartitions; + @CommandLine.Option(names = {"--scan-partitions"}, description = "Scan partition number. This is an approximate value, so the actual scan partition number might vary around this value.
-1 for letting Spark choose an appropriate number.", defaultValue = "-1") + private int scanPartitions; @CommandLine.Option(names = {"--extra-conf"}, description = "Extra Spark config entries applying to generated Spark session. E.g. --extra-conf=k1=v1 --extra-conf=k2=v2") private Map extraSparkConf = Collections.emptyMap(); @@ -124,19 +125,25 @@ public Integer runActions(Action[] actions) { mergeMapSafe(extraSparkConf, runModeEnumeration.extraSparkConf())).asScala().toMap( Predef.conforms()); - final TpcSuite suite; + final Suite suite; switch (benchmarkType) { case "h": suite = new TpchSuite(runModeEnumeration.getSparkMasterUrl(), actions, testConf, baselineConf, extraSparkConfScala, level, errorOnMemLeak, enableUi, enableHsUi, hsUiPort, offHeapSize, disableAqe, disableBhj, - disableWscg, shufflePartitions, minimumScanPartitions); + disableWscg, shufflePartitions, scanPartitions); break; case "ds": suite = new TpcdsSuite(runModeEnumeration.getSparkMasterUrl(), actions, testConf, baselineConf, extraSparkConfScala, level, errorOnMemLeak, enableUi, enableHsUi, hsUiPort, offHeapSize, disableAqe, disableBhj, - disableWscg, shufflePartitions, minimumScanPartitions); + disableWscg, shufflePartitions, scanPartitions); + break; + case "clickbench": + suite = new ClickBenchSuite(runModeEnumeration.getSparkMasterUrl(), actions, testConf, + baselineConf, extraSparkConfScala, level, errorOnMemLeak, enableUi, + enableHsUi, hsUiPort, offHeapSize, disableAqe, disableBhj, + disableWscg, shufflePartitions, scanPartitions); break; default: throw new IllegalArgumentException("TPC benchmark type not found: " + benchmarkType); diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/Tpc.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/Cli.java similarity index 68% rename from tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/Tpc.java rename to tools/gluten-it/common/src/main/java/org/apache/gluten/integration/Cli.java index ceaf71b54672f..be0bf57220feb 100644 --- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/Tpc.java +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/Cli.java @@ -14,26 +14,26 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.gluten.integration.tpc; +package org.apache.gluten.integration; -import org.apache.gluten.integration.tpc.command.DataGenOnly; -import org.apache.gluten.integration.tpc.command.Parameterized; -import org.apache.gluten.integration.tpc.command.Queries; -import org.apache.gluten.integration.tpc.command.QueriesCompare; -import org.apache.gluten.integration.tpc.command.SparkShell; +import org.apache.gluten.integration.command.DataGenOnly; +import org.apache.gluten.integration.command.Parameterized; +import org.apache.gluten.integration.command.Queries; +import org.apache.gluten.integration.command.QueriesCompare; +import org.apache.gluten.integration.command.SparkShell; import picocli.CommandLine; @CommandLine.Command(name = "gluten-it", mixinStandardHelpOptions = true, showDefaultValues = true, subcommands = {DataGenOnly.class, Queries.class, QueriesCompare.class, SparkShell.class, Parameterized.class}, - description = "Gluten integration test using TPC benchmark's data and queries.") -public class Tpc { + description = "Gluten integration test using various benchmarks' data and queries.") +public class Cli { - private Tpc() { + private Cli() { } public static void main(String... args) { - final CommandLine cmd = new CommandLine(new Tpc()); + final CommandLine cmd = new CommandLine(new Cli()); final int exitCode = cmd.execute(args); System.exit(exitCode); } diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/DataGenMixin.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenMixin.java similarity index 83% rename from tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/DataGenMixin.java rename to tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenMixin.java index 72ca0c6996737..0682f5601a926 100644 --- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/DataGenMixin.java +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenMixin.java @@ -14,9 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License.
*/ -package org.apache.gluten.integration.tpc.command; +package org.apache.gluten.integration.command; -import org.apache.gluten.integration.tpc.action.Action; +import org.apache.gluten.integration.action.Action; import picocli.CommandLine; public class DataGenMixin { @@ -33,10 +33,14 @@ public Action[] makeActions() { if (skipDataGen) { return new Action[0]; } - return new Action[]{new org.apache.gluten.integration.tpc.action.DataGenOnly(scale, genPartitionedData)}; + return new Action[]{new org.apache.gluten.integration.action.DataGenOnly(scale, genPartitionedData)}; } public double getScale() { return scale; } + + public boolean genPartitionedData() { + return genPartitionedData; + } } diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/DataGenOnly.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenOnly.java similarity index 90% rename from tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/DataGenOnly.java rename to tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenOnly.java index 11e14426fab01..f1ac4888885b3 100644 --- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/DataGenOnly.java +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenOnly.java @@ -14,9 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.integration.tpc.command; +package org.apache.gluten.integration.command; -import org.apache.gluten.integration.tpc.TpcMixin; +import org.apache.gluten.integration.BaseMixin; import picocli.CommandLine; import java.util.concurrent.Callable; @@ -26,7 +26,7 @@ description = "Generate data only.") public class DataGenOnly implements Callable { @CommandLine.Mixin - private TpcMixin mixin; + private BaseMixin mixin; @CommandLine.Mixin private DataGenMixin dataGenMixin; diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/Parameterized.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/Parameterized.java similarity index 81% rename from tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/Parameterized.java rename to tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/Parameterized.java index bf7d89fe60caa..225b492ef1ae4 100644 --- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/Parameterized.java +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/Parameterized.java @@ -14,13 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.gluten.integration.tpc.command; +package org.apache.gluten.integration.command; import com.google.common.base.Preconditions; -import org.apache.gluten.integration.tpc.TpcMixin; -import org.apache.gluten.integration.tpc.action.Dim; -import org.apache.gluten.integration.tpc.action.DimKv; -import org.apache.gluten.integration.tpc.action.DimValue; +import org.apache.gluten.integration.BaseMixin; import org.apache.commons.lang3.ArrayUtils; import picocli.CommandLine; import scala.Tuple2; @@ -38,7 +35,7 @@ description = "Run queries with parameterized configurations") public class Parameterized implements Callable { @CommandLine.Mixin - private TpcMixin mixin; + private BaseMixin mixin; @CommandLine.Mixin private DataGenMixin dataGenMixin; @@ -62,21 +59,22 @@ public class Parameterized implements Callable { private static final Pattern dimPattern2 = Pattern.compile("([^,:]+)((?:,[^=,]+=[^=,]+)+)"); private static final Pattern excludedDimsPattern = Pattern.compile("[\\w-]+:[^,:]+(?:,[\\w-]+:[^,:]+)*"); + @Override public Integer call() throws Exception { - final Map>>> parsed = new HashMap<>(); + final Map>>> parsed = new LinkedHashMap<>(); - final Seq> excludedCombinations = JavaConverters.asScalaBufferConverter(Arrays.stream(excludedDims).map(d -> { + final Seq> excludedCombinations = JavaConverters.asScalaBufferConverter(Arrays.stream(excludedDims).map(d -> { final Matcher m = excludedDimsPattern.matcher(d); Preconditions.checkArgument(m.matches(), "Unrecognizable excluded dims: " + d); - Set out = new HashSet<>(); + Set out = new HashSet<>(); final String[] dims = d.split(","); for (String dim : dims) { final String[] kv = dim.split(":"); Preconditions.checkArgument(kv.length == 2, "Unrecognizable excluded dims: " + d); - out.add(new DimKv(kv[0], kv[1])); + out.add(new org.apache.gluten.integration.action.Parameterized.DimKv(kv[0], kv[1])); } - return JavaConverters.asScalaSetConverter(out).asScala().toSet(); + return JavaConverters.asScalaSetConverter(out).asScala().toSet(); }).collect(Collectors.toList())).asScala(); // parse dims @@ -120,18 +118,19 @@ public Integer call() throws Exception { } // Convert Map>>> to List - Seq parsedDims = JavaConverters.asScalaBufferConverter( + Seq parsedDims = JavaConverters.asScalaBufferConverter( parsed.entrySet().stream().map(e -> - new Dim(e.getKey(), JavaConverters.asScalaBufferConverter( + new org.apache.gluten.integration.action.Parameterized.Dim(e.getKey(), JavaConverters.asScalaBufferConverter( e.getValue().entrySet().stream().map(e2 -> - new DimValue(e2.getKey(), JavaConverters.asScalaBufferConverter( + new org.apache.gluten.integration.action.Parameterized.DimValue(e2.getKey(), JavaConverters.asScalaBufferConverter( e2.getValue().stream().map(e3 -> new Tuple2<>(e3.getKey(), e3.getValue())) .collect(Collectors.toList())).asScala())).collect(Collectors.toList())).asScala() )).collect(Collectors.toList())).asScala(); - org.apache.gluten.integration.tpc.action.Parameterized parameterized = - new org.apache.gluten.integration.tpc.action.Parameterized(dataGenMixin.getScale(), queriesMixin.queries(), - queriesMixin.explain(), queriesMixin.iterations(), warmupIterations, parsedDims, + org.apache.gluten.integration.action.Parameterized parameterized = + new org.apache.gluten.integration.action.Parameterized(dataGenMixin.getScale(), + dataGenMixin.genPartitionedData(), queriesMixin.queries(), + queriesMixin.explain(), queriesMixin.iterations(), warmupIterations, queriesMixin.noSessionReuse(), parsedDims, excludedCombinations, 
metrics); return mixin.runActions(ArrayUtils.addAll(dataGenMixin.makeActions(), parameterized)); } diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/Queries.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/Queries.java similarity index 81% rename from tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/Queries.java rename to tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/Queries.java index 53d46cc5eac38..c19d66bdae759 100644 --- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/Queries.java +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/Queries.java @@ -14,9 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.integration.tpc.command; +package org.apache.gluten.integration.command; -import org.apache.gluten.integration.tpc.TpcMixin; +import org.apache.gluten.integration.BaseMixin; import org.apache.commons.lang3.ArrayUtils; import picocli.CommandLine; @@ -27,7 +27,7 @@ description = "Run queries.") public class Queries implements Callable { @CommandLine.Mixin - private TpcMixin mixin; + private BaseMixin mixin; @CommandLine.Mixin private DataGenMixin dataGenMixin; @@ -40,9 +40,9 @@ public class Queries implements Callable { @Override public Integer call() throws Exception { - org.apache.gluten.integration.tpc.action.Queries queries = - new org.apache.gluten.integration.tpc.action.Queries(dataGenMixin.getScale(), queriesMixin.queries(), - queriesMixin.explain(), queriesMixin.iterations(), randomKillTasks); + org.apache.gluten.integration.action.Queries queries = + new org.apache.gluten.integration.action.Queries(dataGenMixin.getScale(), dataGenMixin.genPartitionedData(), queriesMixin.queries(), + queriesMixin.explain(), queriesMixin.iterations(), randomKillTasks, queriesMixin.noSessionReuse()); return mixin.runActions(ArrayUtils.addAll(dataGenMixin.makeActions(), queries)); } } diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/QueriesCompare.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/QueriesCompare.java similarity index 76% rename from tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/QueriesCompare.java rename to tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/QueriesCompare.java index d4c0c684dd10d..d194aad185a9b 100644 --- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/QueriesCompare.java +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/QueriesCompare.java @@ -14,9 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.gluten.integration.tpc.command; +package org.apache.gluten.integration.command; -import org.apache.gluten.integration.tpc.TpcMixin; +import org.apache.gluten.integration.BaseMixin; import org.apache.commons.lang3.ArrayUtils; import picocli.CommandLine; @@ -27,7 +27,7 @@ description = "Run queries and do result comparison with baseline preset.") public class QueriesCompare implements Callable { @CommandLine.Mixin - private TpcMixin mixin; + private BaseMixin mixin; @CommandLine.Mixin private DataGenMixin dataGenMixin; @@ -37,9 +37,10 @@ public class QueriesCompare implements Callable { @Override public Integer call() throws Exception { - org.apache.gluten.integration.tpc.action.QueriesCompare queriesCompare = - new org.apache.gluten.integration.tpc.action.QueriesCompare(dataGenMixin.getScale(), queriesMixin.queries(), - queriesMixin.explain(), queriesMixin.iterations()); + org.apache.gluten.integration.action.QueriesCompare queriesCompare = + new org.apache.gluten.integration.action.QueriesCompare(dataGenMixin.getScale(), + dataGenMixin.genPartitionedData(), queriesMixin.queries(), + queriesMixin.explain(), queriesMixin.iterations(), queriesMixin.noSessionReuse()); return mixin.runActions(ArrayUtils.addAll(dataGenMixin.makeActions(), queriesCompare)); } } diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/QueriesMixin.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/QueriesMixin.java similarity index 91% rename from tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/QueriesMixin.java rename to tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/QueriesMixin.java index f514883640b71..64e4b32eca5e5 100644 --- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/QueriesMixin.java +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/QueriesMixin.java @@ -14,11 +14,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.gluten.integration.tpc.command; +package org.apache.gluten.integration.command; import com.google.common.base.Preconditions; -import org.apache.gluten.integration.tpc.TpcSuite; -import org.apache.gluten.integration.tpc.action.Actions; +import org.apache.gluten.integration.Suite; +import org.apache.gluten.integration.action.Actions; import picocli.CommandLine; import scala.collection.Seq; import scala.collection.JavaConverters; @@ -42,6 +42,9 @@ public class QueriesMixin { @CommandLine.Option(names = {"--iterations"}, description = "How many iterations to run", defaultValue = "1") private int iterations; + @CommandLine.Option(names = {"--no-session-reuse"}, description = "Recreate new Spark session each time a query is about to run", defaultValue = "false") + private boolean noSessionReuse; + public boolean explain() { return explain; } @@ -50,10 +53,14 @@ public int iterations() { return iterations; } + public boolean noSessionReuse() { + return noSessionReuse; + } + public Actions.QuerySelector queries() { return new Actions.QuerySelector() { @Override - public Seq select(TpcSuite suite) { + public Seq select(Suite suite) { final List all = select0(suite); final Division div = Division.parse(shard); final List out = div(all, div); @@ -81,7 +88,7 @@ private List div(List from, Division div) { return out; } - private List select0(TpcSuite suite) { + private List select0(Suite suite) { final String[] queryIds = queries; final String[] excludedQueryIds = excludedQueries; if (queryIds.length > 0 && excludedQueryIds.length > 0) { diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/SparkRunModes.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkRunModes.java similarity index 97% rename from tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/SparkRunModes.java rename to tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkRunModes.java index 987099902b035..cfd3848d81580 100644 --- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/SparkRunModes.java +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkRunModes.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.gluten.integration.tpc.command; +package org.apache.gluten.integration.command; import org.apache.spark.launcher.SparkLauncher; import org.apache.spark.util.Utils; @@ -140,6 +140,9 @@ public Map extraSparkConf() { Optional extraClassPath = Arrays.stream(classPathValues).filter(classPath -> { File file = new File(classPath); return file.exists() && file.isFile() && extraJarSet.contains(file.getName()); + }).map(classPath -> { + File file = new File(classPath); + return file.getAbsolutePath(); }).reduce((s1, s2) -> s1 + File.pathSeparator + s2); final Map extras = new HashMap<>(); diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/SparkShell.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkShell.java similarity index 80% rename from tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/SparkShell.java rename to tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkShell.java index f16cedb86cb4a..9c2829e923135 100644 --- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/tpc/command/SparkShell.java +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkShell.java @@ -14,9 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.integration.tpc.command; +package org.apache.gluten.integration.command; -import org.apache.gluten.integration.tpc.TpcMixin; +import org.apache.gluten.integration.BaseMixin; import org.apache.commons.lang3.ArrayUtils; import picocli.CommandLine; @@ -27,15 +27,16 @@ description = "Open a standard Spark shell.") public class SparkShell implements Callable { @CommandLine.Mixin - private TpcMixin mixin; + private BaseMixin mixin; @CommandLine.Mixin private DataGenMixin dataGenMixin; @Override public Integer call() throws Exception { - org.apache.gluten.integration.tpc.action.SparkShell sparkShell = - new org.apache.gluten.integration.tpc.action.SparkShell(dataGenMixin.getScale()); + org.apache.gluten.integration.action.SparkShell sparkShell = + new org.apache.gluten.integration.action.SparkShell(dataGenMixin.getScale(), + dataGenMixin.genPartitionedData()); return mixin.runActions(ArrayUtils.addAll(dataGenMixin.makeActions(), sparkShell)); } } diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q1.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q1.sql new file mode 100644 index 0000000000000..c70aa7a844d75 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q1.sql @@ -0,0 +1 @@ +SELECT COUNT(*) FROM hits; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q10.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q10.sql new file mode 100644 index 0000000000000..f4a9ee3446e83 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q10.sql @@ -0,0 +1 @@ +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q11.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q11.sql new file mode 100644 index 0000000000000..d0ea7e3b386a0 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q11.sql @@ -0,0 +1 @@ +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM 
hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q12.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q12.sql new file mode 100644 index 0000000000000..2a316d173429c --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q12.sql @@ -0,0 +1 @@ +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q13.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q13.sql new file mode 100644 index 0000000000000..7a0254690e796 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q13.sql @@ -0,0 +1 @@ +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q14.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q14.sql new file mode 100644 index 0000000000000..4ce0feed26b90 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q14.sql @@ -0,0 +1 @@ +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q15.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q15.sql new file mode 100644 index 0000000000000..8e85255a0a2d8 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q15.sql @@ -0,0 +1 @@ +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q16.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q16.sql new file mode 100644 index 0000000000000..f959e98a6f5b1 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q16.sql @@ -0,0 +1 @@ +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q17.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q17.sql new file mode 100644 index 0000000000000..50b1f38320a95 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q17.sql @@ -0,0 +1 @@ +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q18.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q18.sql new file mode 100644 index 0000000000000..454cdb507fb49 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q18.sql @@ -0,0 +1 @@ +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q19.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q19.sql new file mode 100644 index 0000000000000..fbd1049004986 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q19.sql @@ -0,0 +1 @@ +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, 
extract(minute FROM EventTime), SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q2.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q2.sql new file mode 100644 index 0000000000000..ad8031a76cf7b --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q2.sql @@ -0,0 +1 @@ +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q20.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q20.sql new file mode 100644 index 0000000000000..3f84066e6a11e --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q20.sql @@ -0,0 +1 @@ +SELECT UserID FROM hits WHERE UserID = 435090932899640449; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q21.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q21.sql new file mode 100644 index 0000000000000..4426afa35fc33 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q21.sql @@ -0,0 +1 @@ +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q22.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q22.sql new file mode 100644 index 0000000000000..bd28609bd2012 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q22.sql @@ -0,0 +1 @@ +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q23.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q23.sql new file mode 100644 index 0000000000000..4ee87ac4528d1 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q23.sql @@ -0,0 +1 @@ +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q24.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q24.sql new file mode 100644 index 0000000000000..935169e37996c --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q24.sql @@ -0,0 +1 @@ +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q25.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q25.sql new file mode 100644 index 0000000000000..1bcfd4c1e4ba3 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q25.sql @@ -0,0 +1 @@ +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q26.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q26.sql new file mode 100644 index 0000000000000..58ea7610cdcda --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q26.sql @@ -0,0 +1 @@ +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q27.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q27.sql new file mode 100644 index 
0000000000000..88ed7ba3e1565 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q27.sql @@ -0,0 +1 @@ +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q28.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q28.sql new file mode 100644 index 0000000000000..d18ffcd72cd31 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q28.sql @@ -0,0 +1 @@ +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q29.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q29.sql new file mode 100644 index 0000000000000..86d6f204be261 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q29.sql @@ -0,0 +1 @@ +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q3.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q3.sql new file mode 100644 index 0000000000000..7db4dc2fe9862 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q3.sql @@ -0,0 +1 @@ +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q30.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q30.sql new file mode 100644 index 0000000000000..630450b432694 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q30.sql @@ -0,0 +1 @@ +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), 
SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q31.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q31.sql new file mode 100644 index 0000000000000..f8a80d9c6fb9f --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q31.sql @@ -0,0 +1 @@ +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q32.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q32.sql new file mode 100644 index 0000000000000..ba8ac2f91d270 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q32.sql @@ -0,0 +1 @@ +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q33.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q33.sql new file mode 100644 index 0000000000000..893773a20001e --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q33.sql @@ -0,0 +1 @@ +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q34.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q34.sql new file mode 100644 index 0000000000000..f00a3e8efb935 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q34.sql @@ -0,0 +1 @@ +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q35.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q35.sql new file mode 100644 index 0000000000000..2137530830476 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q35.sql @@ -0,0 +1 @@ +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q36.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q36.sql new file mode 100644 index 0000000000000..581e5e389fcd9 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q36.sql @@ -0,0 +1 @@ +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC 
LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q37.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q37.sql new file mode 100644 index 0000000000000..7aa52984543b4 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q37.sql @@ -0,0 +1 @@ +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q38.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q38.sql new file mode 100644 index 0000000000000..b1a580dba0703 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q38.sql @@ -0,0 +1 @@ +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q39.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q39.sql new file mode 100644 index 0000000000000..93b691f37cd27 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q39.sql @@ -0,0 +1 @@ +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC OFFSET 1000 LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q4.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q4.sql new file mode 100644 index 0000000000000..e953498fe3ab7 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q4.sql @@ -0,0 +1 @@ +SELECT AVG(UserID) FROM hits; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q40.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q40.sql new file mode 100644 index 0000000000000..d97b60772000e --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q40.sql @@ -0,0 +1 @@ +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END, URL ORDER BY PageViews DESC OFFSET 1000 LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q41.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q41.sql new file mode 100644 index 0000000000000..321a06e3e7013 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q41.sql @@ -0,0 +1 @@ +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC OFFSET 100 LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q42.sql 
b/tools/gluten-it/common/src/main/resources/clickbench-queries/q42.sql new file mode 100644 index 0000000000000..46b81c5be467b --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q42.sql @@ -0,0 +1 @@ +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC OFFSET 10000 LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q43.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q43.sql new file mode 100644 index 0000000000000..b54c0921149ba --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q43.sql @@ -0,0 +1 @@ +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-14' AND EventDate <= DATE '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) OFFSET 1000 LIMIT 10; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q5.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q5.sql new file mode 100644 index 0000000000000..2f9baf82535a7 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q5.sql @@ -0,0 +1 @@ +SELECT COUNT(DISTINCT UserID) FROM hits; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q6.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q6.sql new file mode 100644 index 0000000000000..e9615f8fd345a --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q6.sql @@ -0,0 +1 @@ +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q7.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q7.sql new file mode 100644 index 0000000000000..03fbb82d68b85 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q7.sql @@ -0,0 +1 @@ +SELECT MIN(EventDate), MAX(EventDate) FROM hits; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q8.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q8.sql new file mode 100644 index 0000000000000..25a10b112d561 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q8.sql @@ -0,0 +1 @@ +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q9.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q9.sql new file mode 100644 index 0000000000000..a6bf47e4fea18 --- /dev/null +++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q9.sql @@ -0,0 +1 @@ +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/Constants.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Constants.scala similarity index 97% rename from tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/Constants.scala rename to tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Constants.scala index d39a16c325efc..e680ce9d5dda3 100644 --- 
a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/Constants.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Constants.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.integration.tpc +package org.apache.gluten.integration import org.apache.spark.SparkConf import org.apache.spark.sql.TypeUtils @@ -44,6 +44,7 @@ object Constants { val VELOX_WITH_CELEBORN_CONF: SparkConf = new SparkConf(false) .set("spark.gluten.sql.columnar.forceShuffledHashJoin", "true") + .set("spark.gluten.sql.columnar.shuffle.celeborn.fallback.enabled", "false") .set("spark.sql.parquet.enableVectorizedReader", "true") .set("spark.plugins", "org.apache.gluten.GlutenPlugin") .set( diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/DataGen.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/DataGen.scala similarity index 98% rename from tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/DataGen.scala rename to tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/DataGen.scala index e810a4dc2316a..6b2d4ec71b865 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/DataGen.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/DataGen.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.integration.tpc +package org.apache.gluten.integration import org.apache.spark.sql.types.{DataType, StructField, StructType} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala new file mode 100644 index 0000000000000..9791242f18cd4 --- /dev/null +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.integration + +import com.google.common.base.Preconditions +import org.apache.commons.lang3.exception.ExceptionUtils +import org.apache.spark.sql.{RunResult, SparkQueryRunner, SparkSession} + +import java.io.File + +class QueryRunner(val queryResourceFolder: String, val dataPath: String) { + import QueryRunner._ + + Preconditions.checkState( + new File(dataPath).exists(), + s"Data not found at $dataPath, try using command ` data-gen-only ` to generate it first.", + Array(): _*) + + def createTables(creator: TableCreator, spark: SparkSession): Unit = { + creator.create(spark, dataPath) + } + + def runQuery( + spark: SparkSession, + desc: String, + caseId: String, + explain: Boolean = false, + metrics: Array[String] = Array(), + randomKillTasks: Boolean = false): QueryResult = { + val path = "%s/%s.sql".format(queryResourceFolder, caseId) + try { + val r = SparkQueryRunner.runQuery(spark, desc, path, explain, metrics, randomKillTasks) + println(s"Successfully ran query $caseId. Returned row count: ${r.rows.length}") + Success(caseId, r) + } catch { + case e: Exception => + println(s"Error running query $caseId. Error: ${ExceptionUtils.getStackTrace(e)}") + Failure(caseId, e) + } + } +} + +object QueryRunner { + sealed trait QueryResult { + def caseId(): String + def succeeded(): Boolean + } + + implicit class QueryResultOps(r: QueryResult) { + def asSuccessOption(): Option[Success] = { + r match { + case s: Success => Some(s) + case _: Failure => None + } + } + + def asFailureOption(): Option[Failure] = { + r match { + case _: Success => None + case f: Failure => Some(f) + } + } + + def asSuccess(): Success = { + asSuccessOption().get + } + + def asFailure(): Failure = { + asFailureOption().get + } + } + + case class Success(override val caseId: String, runResult: RunResult) extends QueryResult { + override def succeeded(): Boolean = true + } + case class Failure(override val caseId: String, error: Exception) extends QueryResult { + override def succeeded(): Boolean = false + } +} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/ShimUtils.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ShimUtils.scala similarity index 97% rename from tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/ShimUtils.scala rename to tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ShimUtils.scala index 19e15df5cca74..d2986bfa7c0cd 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/ShimUtils.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ShimUtils.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.gluten.integration.tpc +package org.apache.gluten.integration import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/TpcSuite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Suite.scala similarity index 84% rename from tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/TpcSuite.scala rename to tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Suite.scala index f7605e273eb19..070c43e9be2f9 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/TpcSuite.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Suite.scala @@ -14,22 +14,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.integration.tpc - -import org.apache.gluten.integration.tpc.action.Action +package org.apache.gluten.integration +import org.apache.gluten.integration.action.Action +import org.apache.log4j.{Level, LogManager} import org.apache.spark.SparkConf import org.apache.spark.deploy.history.HistoryServerHelper import org.apache.spark.network.util.ByteUnit import org.apache.spark.sql.ConfUtils.ConfImplicits._ import org.apache.spark.sql.SparkSessionSwitcher -import org.apache.log4j.{Level, LogManager} - import java.io.File import java.util.Scanner -abstract class TpcSuite( +abstract class Suite( private val masterUrl: String, private val actions: Array[Action], private val testConf: SparkConf, @@ -45,11 +43,11 @@ abstract class TpcSuite( private val disableBhj: Boolean, private val disableWscg: Boolean, private val shufflePartitions: Int, - private val minimumScanPartitions: Boolean) { + private val scanPartitions: Int) { resetLogLevel() - private[tpc] val sessionSwitcher: SparkSessionSwitcher = + private[integration] val sessionSwitcher: SparkSessionSwitcher = new SparkSessionSwitcher(masterUrl, logLevel.toString) // define initial configs @@ -62,7 +60,10 @@ abstract class TpcSuite( .setWarningOnOverriding("spark.storage.blockManagerSlaveTimeoutMs", "3600000") sessionSwitcher .defaultConf() - .setWarningOnOverriding("spark.executor.heartbeatInterval", "1s") // for keeping metrics updated + .setWarningOnOverriding("spark.executor.heartbeatInterval", "10s") + sessionSwitcher + .defaultConf() + .setWarningOnOverriding("spark.worker.timeout", "3600") sessionSwitcher .defaultConf() .setWarningOnOverriding("spark.executor.metrics.pollingInterval", "0") @@ -105,14 +106,17 @@ abstract class TpcSuite( sessionSwitcher.defaultConf().setWarningOnOverriding("spark.sql.codegen.wholeStage", "false") } - if (minimumScanPartitions) { + if (scanPartitions != -1) { + // Tune the file-split configs below so the scan yields approximately the requested number of partitions.
sessionSwitcher .defaultConf() .setWarningOnOverriding("spark.sql.files.maxPartitionBytes", s"${ByteUnit.PiB.toBytes(1L)}") sessionSwitcher .defaultConf() - .setWarningOnOverriding("spark.sql.files.openCostInBytes", s"${ByteUnit.PiB.toBytes(1L)}") - sessionSwitcher.defaultConf().setWarningOnOverriding("spark.default.parallelism", "1") + .setWarningOnOverriding("spark.sql.files.openCostInBytes", "0") + sessionSwitcher + .defaultConf() + .setWarningOnOverriding("spark.sql.files.minPartitionNum", s"${(scanPartitions - 1) max 1}") } extraSparkConf.toStream.foreach { kv => @@ -153,32 +157,32 @@ abstract class TpcSuite( } } + def tableCreator(): TableCreator + private def resetLogLevel(): Unit = { LogManager.getRootLogger.setLevel(logLevel) } - private[tpc] def getBaselineConf(): SparkConf = { + private[integration] def getBaselineConf(): SparkConf = { baselineConf.clone() } - private[tpc] def getTestConf(): SparkConf = { + private[integration] def getTestConf(): SparkConf = { testConf.clone() } protected def historyWritePath(): String - private[tpc] def dataWritePath(scale: Double): String - - private[tpc] def createDataGen(scale: Double, genPartitionedData: Boolean): DataGen + private[integration] def dataWritePath(scale: Double, genPartitionedData: Boolean): String - private[tpc] def queryResource(): String + private[integration] def createDataGen(scale: Double, genPartitionedData: Boolean): DataGen - protected def typeModifiers(): List[TypeModifier] + private[integration] def queryResource(): String - private[tpc] def allQueryIds(): Array[String] + private[integration] def allQueryIds(): Array[String] - private[tpc] def desc(): String + private[integration] def desc(): String } -object TpcSuite {} +object Suite {} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/TableCreator.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/TableCreator.scala new file mode 100644 index 0000000000000..b35aceef8768a --- /dev/null +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/TableCreator.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
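[Editor's note] For reference, the scanPartitions branch above steers the file-based scan toward roughly the requested partition count; a REPL-style sketch of the equivalent plain SparkConf settings (the value 8 is only an example):

import org.apache.spark.SparkConf
import org.apache.spark.network.util.ByteUnit

val scanPartitions = 8 // example value
val conf = new SparkConf()
  .set("spark.sql.files.maxPartitionBytes", s"${ByteUnit.PiB.toBytes(1L)}") // never split on size
  .set("spark.sql.files.openCostInBytes", "0")
  .set("spark.sql.files.minPartitionNum", s"${(scanPartitions - 1) max 1}")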
+ */ +package org.apache.gluten.integration + +import org.apache.spark.sql.{AnalysisException, SparkSession} + +import java.io.File + +trait TableCreator { + def create(spark: SparkSession, dataPath: String): Unit +} + +object TableCreator { + def discoverSchema(): TableCreator = { + DiscoverSchema + } + + private object DiscoverSchema extends TableCreator { + override def create(spark: SparkSession, dataPath: String): Unit = { + val files = new File(dataPath).listFiles() + files.foreach(file => { + if (spark.catalog.tableExists(file.getName)) { + println("Table exists: " + file.getName) + } else { + println("Creating catalog table: " + file.getName) + spark.catalog.createTable(file.getName, file.getAbsolutePath, "parquet") + try { + spark.catalog.recoverPartitions(file.getName) + } catch { + case _: AnalysisException => + } + } + }) + } + } +} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/Actions.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/Actions.scala similarity index 82% rename from tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/Actions.scala rename to tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/Actions.scala index 5e49b2888856d..4977dda70820b 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/Actions.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/Actions.scala @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.integration.tpc.action +package org.apache.gluten.integration.action -import org.apache.gluten.integration.tpc.TpcSuite +import org.apache.gluten.integration.Suite trait Action { - def execute(tpcSuite: TpcSuite): Boolean + def execute(suite: Suite): Boolean } object Actions { trait QuerySelector { - def select(suite: TpcSuite): Seq[String] + def select(suite: Suite): Seq[String] } } diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/DataGenOnly.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/DataGenOnly.scala similarity index 76% rename from tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/DataGenOnly.scala rename to tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/DataGenOnly.scala index 488bb19adaeff..bc43834610a4b 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/DataGenOnly.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/DataGenOnly.scala @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
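[Editor's note] A brief hedged example of the schema-discovering creator defined above; the data directory is a placeholder and the SparkSession is assumed to be available.

import org.apache.gluten.integration.TableCreator
import org.apache.spark.sql.SparkSession

// Registers one catalog table per file or sub-directory found under the data path (path is an example):
def registerGeneratedTables(spark: SparkSession): Unit = {
  TableCreator.discoverSchema().create(spark, "/tmp/tpch-generated-0.1")
}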
*/ -package org.apache.gluten.integration.tpc.action +package org.apache.gluten.integration.action -import org.apache.gluten.integration.tpc.TpcSuite +import org.apache.gluten.integration.Suite import java.io.File case class DataGenOnly(scale: Double, genPartitionedData: Boolean) extends Action { - override def execute(tpcSuite: TpcSuite): Boolean = { - tpcSuite.sessionSwitcher.useSession("baseline", "Data Gen") - val dataGen = tpcSuite.createDataGen(scale, genPartitionedData) + override def execute(suite: Suite): Boolean = { + suite.sessionSwitcher.useSession("baseline", "Data Gen") + val dataGen = suite.createDataGen(scale, genPartitionedData) dataGen.gen() true } diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/Parameterized.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/Parameterized.scala new file mode 100644 index 0000000000000..c9ebb97543947 --- /dev/null +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/Parameterized.scala @@ -0,0 +1,363 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
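[Editor's note] DataGenOnly above is the smallest real Action in the patch; for orientation, a hypothetical Action that only prints the suite description (the class name is invented, not part of this change):

package org.apache.gluten.integration.action

import org.apache.gluten.integration.Suite

// Hypothetical Action: prints the suite's description and reports success.
case class PrintDesc() extends Action {
  override def execute(suite: Suite): Boolean = {
    println(s"Suite under test: ${suite.desc()}")
    true
  }
}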
+ */ +package org.apache.gluten.integration.action + +import org.apache.commons.lang3.exception.ExceptionUtils +import org.apache.gluten.integration.QueryRunner.QueryResult +import org.apache.gluten.integration.action.Actions.QuerySelector +import org.apache.gluten.integration.action.TableRender.Field +import org.apache.gluten.integration.action.TableRender.RowParser.FieldAppender.RowAppender +import org.apache.gluten.integration.stat.RamStat +import org.apache.gluten.integration.{QueryRunner, Suite} +import org.apache.spark.sql.ConfUtils.ConfImplicits._ +import org.apache.spark.sql.SparkSession + +import java.util.concurrent.atomic.{AtomicInteger, AtomicLong} +import scala.collection.mutable +import scala.collection.mutable.ListBuffer + +class Parameterized( + scale: Double, + genPartitionedData: Boolean, + queries: QuerySelector, + explain: Boolean, + iterations: Int, + warmupIterations: Int, + noSessionReuse: Boolean, + configDimensions: Seq[Parameterized.Dim], + excludedCombinations: Seq[Set[Parameterized.DimKv]], + metrics: Array[String]) + extends Action { + import Parameterized._ + + validateDims(configDimensions) + + private def validateDims(configDimensions: Seq[Dim]): Unit = { + if (configDimensions + .map(dim => { + dim.name + }) + .toSet + .size != configDimensions.size) { + throw new IllegalStateException("Duplicated dimension name found") + } + + configDimensions.foreach { dim => + if (dim.dimValues.map(dimValue => dimValue.name).toSet.size != dim.dimValues.size) { + throw new IllegalStateException("Duplicated dimension value found") + } + } + } + + private val coordinates: mutable.LinkedHashMap[Coordinate, Seq[(String, String)]] = { + val dimCount = configDimensions.size + val coordinateMap = mutable.LinkedHashMap[Coordinate, Seq[(String, String)]]() + val nextId: AtomicInteger = new AtomicInteger(1); + + def fillCoordinates( + dimOffset: Int, + intermediateCoordinate: Map[String, String], + intermediateConf: Seq[(String, String)]): Unit = { + if (dimOffset == dimCount) { + // we got one coordinate + excludedCombinations.foreach { ec: Set[DimKv] => + if (ec.forall { kv => + intermediateCoordinate.contains(kv.k) && intermediateCoordinate(kv.k) == kv.v + }) { + println(s"Coordinate $intermediateCoordinate excluded by $ec.") + return + } + } + coordinateMap(Coordinate(nextId.getAndIncrement(), intermediateCoordinate)) = + intermediateConf + return + } + val dim = configDimensions(dimOffset) + dim.dimValues.foreach { dimValue => + fillCoordinates( + dimOffset + 1, + intermediateCoordinate + (dim.name -> dimValue.name), + intermediateConf ++ dimValue.conf) + } + } + + fillCoordinates(0, Map(), Seq()) + + coordinateMap + } + + override def execute(suite: Suite): Boolean = { + val runner: QueryRunner = + new QueryRunner(suite.queryResource(), suite.dataWritePath(scale, genPartitionedData)) + + val sessionSwitcher = suite.sessionSwitcher + val testConf = suite.getTestConf() + + println("Prepared coordinates: ") + coordinates.keys.foreach { c => + println(s" ${c.id}: $c") + } + coordinates.foreach { entry => + // register one session per coordinate + val coordinate = entry._1 + val coordinateConf = entry._2 + val conf = testConf.clone() + conf.setAllWarningOnOverriding(coordinateConf) + sessionSwitcher.registerSession(coordinate.toString, conf) + } + + val runQueryIds = queries.select(suite).map(TestResultLine.QueryId(_)) + + val marks: Seq[TestResultLine.CoordMark] = coordinates.flatMap { entry => + val coordinate = entry._1 + sessionSwitcher.useSession(coordinate.toString, 
"Parameterized %s".format(coordinate)) + runner.createTables(suite.tableCreator(), sessionSwitcher.spark()) + + runQueryIds.flatMap { queryId => + // warm up + (0 until warmupIterations).foreach { iteration => + println(s"Warming up: Running query $queryId (iteration $iteration)...") + try { + Parameterized.warmUp( + runner, + sessionSwitcher.spark(), + queryId.id, + coordinate, + suite.desc()) + } finally { + if (noSessionReuse) { + sessionSwitcher.renewSession() + runner.createTables(suite.tableCreator(), sessionSwitcher.spark()) + } + } + } + + // run + (0 until iterations).map { iteration => + println(s"Running query $queryId with coordinate $coordinate (iteration $iteration)...") + val r = + try { + Parameterized.runQuery( + runner, + sessionSwitcher.spark(), + queryId.id, + coordinate, + suite.desc(), + explain, + metrics) + } finally { + if (noSessionReuse) { + sessionSwitcher.renewSession() + runner.createTables(suite.tableCreator(), sessionSwitcher.spark()) + } + } + TestResultLine.CoordMark(iteration, queryId, r) + } + } + }.toSeq + + val results: Seq[TestResultLine] = marks + .groupBy(m => (m.iteration, m.queryId)) + .toSeq + .sortBy(_._1) + .map { e => + val iteration = e._1._1 + val queryId = e._1._2 + val marks = e._2 + val line = TestResultLine(queryId, marks.map(_.coord).toList) + line + } + + val succeededCount = results.count(l => l.succeeded()) + val totalCount = results.count(_ => true) + + // RAM stats + println("Performing GC to collect RAM statistics... ") + System.gc() + System.gc() + printf( + "RAM statistics: JVM Heap size: %d KiB (total %d KiB), Process RSS: %d KiB\n", + RamStat.getJvmHeapUsed(), + RamStat.getJvmHeapTotal(), + RamStat.getProcessRamUsed()) + + println("") + println("Test report: ") + println("") + printf( + "Summary: %d out of %d queries successfully run on all config combinations. \n", + succeededCount, + totalCount) + println("") + println("Configurations:") + coordinates.foreach(coord => println(s"${coord._1.id}. ${coord._1}")) + println("") + val succeeded = results.filter(_.succeeded()) + val all = succeeded match { + case Nil => None + case several => + Some( + TestResultLine( + TestResultLine.QueryId("all"), + coordinates.keys.map { c => + TestResultLine.Coord( + c, + several + .map(_.coord(c.id)) + .map(_.queryResult) + .asSuccesses() + .agg(s"coordinate $c") + .get) + }.toSeq)) + } + TestResultLines(coordinates.map(_._1.id).toSeq, configDimensions, metrics, succeeded ++ all) + .print() + println("") + + if (succeededCount == totalCount) { + println("No failed queries. 
") + println("") + } else { + println("Failed queries: ") + println("") + TestResultLines( + coordinates.map(_._1.id).toSeq, + configDimensions, + metrics, + results.filter(!_.succeeded())) + .print() + println("") + } + + if (succeededCount != totalCount) { + return false + } + true + } +} + +object Parameterized { + case class DimKv(k: String, v: String) + + case class Dim(name: String, dimValues: Seq[DimValue]) + + case class DimValue(name: String, conf: Seq[(String, String)]) + + // coordinate: [dim, dim value] + case class Coordinate(id: Int, coordinate: Map[String, String]) { + override def toString: String = coordinate.mkString(", ") + } + + case class TestResultLine( + queryId: TestResultLine.QueryId, + coordinates: Seq[TestResultLine.Coord]) { + private val coordMap = coordinates.map(c => c.coordinate.id -> c).toMap + def succeeded(): Boolean = { + coordinates.forall(_.queryResult.succeeded()) + } + + def coord(id: Int): TestResultLine.Coord = coordMap(id) + } + + object TestResultLine { + case class QueryId(id: String) { + import QueryId._ + private val uid = nextUid.getAndIncrement() + override def toString: String = id + } + + object QueryId { + private val nextUid = new AtomicLong(0L) + implicit val o: Ordering[QueryId] = Ordering.by(_.uid) + } + + case class Coord(coordinate: Coordinate, queryResult: QueryResult) + case class CoordMark(iteration: Int, queryId: QueryId, coord: Coord) + + class Parser(coordIds: Seq[Int], metricNames: Seq[String]) + extends TableRender.RowParser[TestResultLine] { + override def parse(rowAppender: RowAppender, line: TestResultLine): Unit = { + val inc = rowAppender.incremental() + inc.next().write(line.queryId) + val coords = coordIds.map(id => line.coord(id)) + coords.foreach(coord => inc.next().write(coord.queryResult.succeeded())) + coords.foreach(coord => + inc.next().write(coord.queryResult.asSuccessOption().map(_.runResult.rows.size))) + metricNames.foreach(metricName => + coords.foreach(coord => + inc + .next() + .write(coord.queryResult.asSuccessOption().map(_.runResult.metrics(metricName))))) + coords.foreach(coord => + inc + .next() + .write(coord.queryResult.asSuccessOption().map(_.runResult.planningTimeMillis))) + coords.foreach(coord => + inc + .next() + .write(coord.queryResult.asSuccessOption().map(_.runResult.executionTimeMillis))) + } + } + } + + case class TestResultLines( + coordIds: Seq[Int], + configDimensions: Seq[Dim], + metricNames: Seq[String], + lines: Iterable[TestResultLine]) { + def print(): Unit = { + val fields = ListBuffer[Field](Field.Leaf("Query ID")) + val coordFields = coordIds.map(id => Field.Leaf(id.toString)) + + fields.append(Field.Branch("Succeeded", coordFields)) + fields.append(Field.Branch("Row Count", coordFields)) + metricNames.foreach(metricName => fields.append(Field.Branch(metricName, coordFields))) + fields.append(Field.Branch("Planning Time (Millis)", coordFields)) + fields.append(Field.Branch("Query Time (Millis)", coordFields)) + + val render = + TableRender.create[TestResultLine](fields: _*)( + new TestResultLine.Parser(coordIds, metricNames)) + + lines.foreach(line => render.appendRow(line)) + + render.print(System.out) + } + } + + private def runQuery( + runner: QueryRunner, + spark: SparkSession, + id: String, + coordinate: Coordinate, + desc: String, + explain: Boolean, + metrics: Array[String]): TestResultLine.Coord = { + val testDesc = "Query %s [%s] %s".format(desc, id, coordinate) + val result = runner.runQuery(spark, testDesc, id, explain, metrics) + TestResultLine.Coord(coordinate, 
result) + } + + private def warmUp( + runner: QueryRunner, + session: SparkSession, + id: String, + coordinate: Coordinate, + desc: String): Unit = { + runQuery(runner, session, id, coordinate, desc, explain = false, Array.empty) + } +} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/Queries.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/Queries.scala new file mode 100644 index 0000000000000..b8a42f3939321 --- /dev/null +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/Queries.scala @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.integration.action + +import org.apache.gluten.integration.QueryRunner.QueryResult +import org.apache.gluten.integration.action.Actions.QuerySelector +import org.apache.gluten.integration.action.TableRender.RowParser.FieldAppender.RowAppender +import org.apache.gluten.integration.stat.RamStat +import org.apache.gluten.integration.{QueryRunner, Suite, TableCreator} +import org.apache.spark.sql.{SparkSession} + +case class Queries( + scale: Double, + genPartitionedData: Boolean, + queries: QuerySelector, + explain: Boolean, + iterations: Int, + randomKillTasks: Boolean, + noSessionReuse: Boolean) + extends Action { + import Queries._ + + override def execute(suite: Suite): Boolean = { + val runQueryIds = queries.select(suite) + val runner: QueryRunner = + new QueryRunner(suite.queryResource(), suite.dataWritePath(scale, genPartitionedData)) + val sessionSwitcher = suite.sessionSwitcher + sessionSwitcher.useSession("test", "Run Queries") + runner.createTables(suite.tableCreator(), sessionSwitcher.spark()) + val results = (0 until iterations).flatMap { iteration => + println(s"Running tests (iteration $iteration)...") + runQueryIds.map { queryId => + try { + Queries.runQuery( + runner, + suite.tableCreator(), + sessionSwitcher.spark(), + queryId, + suite.desc(), + explain, + randomKillTasks) + } finally { + if (noSessionReuse) { + sessionSwitcher.renewSession() + runner.createTables(suite.tableCreator(), sessionSwitcher.spark()) + } + } + } + }.toList + + val passedCount = results.count(l => l.queryResult.succeeded()) + val count = results.count(_ => true) + + // RAM stats + println("Performing GC to collect RAM statistics... ") + System.gc() + System.gc() + printf( + "RAM statistics: JVM Heap size: %d KiB (total %d KiB), Process RSS: %d KiB\n", + RamStat.getJvmHeapUsed(), + RamStat.getJvmHeapTotal(), + RamStat.getProcessRamUsed()) + + println("") + println("Test report: ") + println("") + printf("Summary: %d out of %d queries passed. 
\n", passedCount, count) + println("") + val succeeded = results.filter(_.queryResult.succeeded()) + val all = succeeded.map(_.queryResult).asSuccesses().agg("all").map(s => TestResultLine(s)) + Queries.printResults(succeeded ++ all) + println("") + + if (passedCount == count) { + println("No failed queries. ") + println("") + } else { + println("Failed queries: ") + println("") + Queries.printResults(results.filter(!_.queryResult.succeeded())) + println("") + } + + if (passedCount != count) { + return false + } + true + } +} + +object Queries { + case class TestResultLine(queryResult: QueryResult) + + object TestResultLine { + implicit object Parser extends TableRender.RowParser[TestResultLine] { + override def parse(rowAppender: RowAppender, line: TestResultLine): Unit = { + val inc = rowAppender.incremental() + inc.next().write(line.queryResult.caseId()) + inc.next().write(line.queryResult.succeeded()) + line.queryResult match { + case QueryRunner.Success(_, runResult) => + inc.next().write(runResult.rows.size) + inc.next().write(runResult.planningTimeMillis) + inc.next().write(runResult.executionTimeMillis) + case QueryRunner.Failure(_, error) => + inc.next().write(None) + inc.next().write(None) + inc.next().write(None) + } + } + } + } + + private def printResults(results: Seq[TestResultLine]): Unit = { + val render = TableRender.plain[TestResultLine]( + "Query ID", + "Was Passed", + "Row Count", + "Plan Time (Millis)", + "Query Time (Millis)") + + results.foreach { line => + render.appendRow(line) + } + + render.print(System.out) + } + + private def runQuery( + runner: QueryRunner, + creator: TableCreator, + session: SparkSession, + id: String, + desc: String, + explain: Boolean, + randomKillTasks: Boolean): TestResultLine = { + println(s"Running query: $id...") + val testDesc = "Query %s [%s]".format(desc, id) + val result = + runner.runQuery(session, testDesc, id, explain = explain, randomKillTasks = randomKillTasks) + TestResultLine(result) + } +} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/QueriesCompare.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/QueriesCompare.scala new file mode 100644 index 0000000000000..804f1fbd79f39 --- /dev/null +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/QueriesCompare.scala @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.integration.action + +import org.apache.commons.lang3.exception.ExceptionUtils +import org.apache.gluten.integration.QueryRunner.QueryResult +import org.apache.gluten.integration.action.Actions.QuerySelector +import org.apache.gluten.integration.action.QueriesCompare.TestResultLine +import org.apache.gluten.integration.action.TableRender.RowParser.FieldAppender.RowAppender +import org.apache.gluten.integration.stat.RamStat +import org.apache.gluten.integration.{QueryRunner, Suite, TableCreator} +import org.apache.spark.sql.{RunResult, SparkSession, SparkSessionSwitcher, TestUtils} + +case class QueriesCompare( + scale: Double, + genPartitionedData: Boolean, + queries: QuerySelector, + explain: Boolean, + iterations: Int, + noSessionReuse: Boolean) + extends Action { + + override def execute(suite: Suite): Boolean = { + val runner: QueryRunner = + new QueryRunner(suite.queryResource(), suite.dataWritePath(scale, genPartitionedData)) + val runQueryIds = queries.select(suite) + val sessionSwitcher = suite.sessionSwitcher + + sessionSwitcher.useSession("baseline", "Run Baseline Queries") + runner.createTables(suite.tableCreator(), sessionSwitcher.spark()) + val baselineResults = (0 until iterations).flatMap { iteration => + runQueryIds.map { queryId => + println(s"Running baseline query $queryId (iteration $iteration)...") + try { + QueriesCompare.runBaselineQuery( + runner, + sessionSwitcher.spark(), + suite.desc(), + queryId, + explain) + } finally { + if (noSessionReuse) { + sessionSwitcher.renewSession() + runner.createTables(suite.tableCreator(), sessionSwitcher.spark()) + } + } + } + }.toList + + sessionSwitcher.useSession("test", "Run Test Queries") + runner.createTables(suite.tableCreator(), sessionSwitcher.spark()) + val testResults = (0 until iterations).flatMap { iteration => + runQueryIds.map { queryId => + println(s"Running test query $queryId (iteration $iteration)...") + try { + QueriesCompare.runTestQuery( + runner, + sessionSwitcher.spark(), + suite.desc(), + queryId, + explain) + } finally { + if (noSessionReuse) { + sessionSwitcher.renewSession() + runner.createTables(suite.tableCreator(), sessionSwitcher.spark()) + } + } + } + }.toList + + assert(baselineResults.size == testResults.size) + + val results: Seq[TestResultLine] = baselineResults.zip(testResults).map { case (b, t) => + assert(b.caseId() == t.caseId()) + TestResultLine(b.caseId(), b, t) + } + + val passedCount = results.count(l => l.testPassed) + val count = results.count(_ => true) + + // RAM stats + println("Performing GC to collect RAM statistics... ") + System.gc() + System.gc() + printf( + "RAM statistics: JVM Heap size: %d KiB (total %d KiB), Process RSS: %d KiB\n", + RamStat.getJvmHeapUsed(), + RamStat.getJvmHeapTotal(), + RamStat.getProcessRamUsed()) + + println("") + println("Test report: ") + println("") + printf("Summary: %d out of %d queries passed. \n", passedCount, count) + println("") + val succeeded = results.filter(_.testPassed) + val all = succeeded match { + case Nil => None + case several => + val allExpected = several.map(_.expected).asSuccesses().agg("all expected").get + val allActual = several.map(_.actual).asSuccesses().agg("all actual").get + Some(TestResultLine("all", allExpected, allActual)) + } + QueriesCompare.printResults(succeeded ++ all) + println("") + + if (passedCount == count) { + println("No failed queries. 
") + println("") + } else { + println( + "Failed queries (a failed query with correct row count indicates value mismatches): ") + println("") + QueriesCompare.printResults(results.filter(!_.testPassed)) + println("") + } + + if (passedCount != count) { + return false + } + true + } +} + +object QueriesCompare { + case class TestResultLine(queryId: String, expected: QueryResult, actual: QueryResult) { + val testPassed: Boolean = { + expected.succeeded() && actual.succeeded() && + TestUtils + .compareAnswers( + expected.asSuccess().runResult.rows, + actual.asSuccess().runResult.rows, + sort = true) + .isEmpty + } + } + + object TestResultLine { + implicit object Parser extends TableRender.RowParser[TestResultLine] { + override def parse(rowAppender: RowAppender, line: TestResultLine): Unit = { + val inc = rowAppender.incremental() + inc.next().write(line.queryId) + inc.next().write(line.testPassed) + inc.next().write(line.expected.asSuccessOption().map(_.runResult.rows.size)) + inc.next().write(line.actual.asSuccessOption().map(_.runResult.rows.size)) + inc.next().write(line.expected.asSuccessOption().map(_.runResult.planningTimeMillis)) + inc.next().write(line.actual.asSuccessOption().map(_.runResult.planningTimeMillis)) + inc.next().write(line.expected.asSuccessOption().map(_.runResult.executionTimeMillis)) + inc.next().write(line.actual.asSuccessOption().map(_.runResult.executionTimeMillis)) + + val speedUp = + if (line.expected.succeeded() && line.actual.succeeded()) { + Some( + ((line.expected.asSuccess().runResult.executionTimeMillis - line.actual + .asSuccess() + .runResult + .executionTimeMillis).toDouble + / line.actual.asSuccess().runResult.executionTimeMillis) * 100) + } else None + inc.next().write(speedUp.map("%.2f%%".format(_))) + } + } + } + + private def printResults(results: Seq[TestResultLine]): Unit = { + import org.apache.gluten.integration.action.TableRender.Field._ + + val render = TableRender.create[TestResultLine]( + Leaf("Query ID"), + Leaf("Passed"), + Branch("Row Count", List(Leaf("Vanilla"), Leaf("Gluten"))), + Branch("Planning Time (Millis)", List(Leaf("Vanilla"), Leaf("Gluten"))), + Branch("Query Time (Millis)", List(Leaf("Vanilla"), Leaf("Gluten"))), + Leaf("Speedup")) + + results.foreach { line => + render.appendRow(line) + } + + render.print(System.out) + } + + private def runBaselineQuery( + runner: QueryRunner, + session: SparkSession, + desc: String, + id: String, + explain: Boolean): QueryResult = { + val testDesc = "Baseline %s [%s]".format(desc, id) + val result = runner.runQuery(session, testDesc, id, explain = explain) + result + } + + private def runTestQuery( + runner: QueryRunner, + session: SparkSession, + desc: String, + id: String, + explain: Boolean): QueryResult = { + val testDesc = "Query %s [%s]".format(desc, id) + val result = runner.runQuery(session, testDesc, id, explain = explain) + result + } +} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/SparkShell.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/SparkShell.scala similarity index 59% rename from tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/SparkShell.scala rename to tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/SparkShell.scala index 78e816955e2ed..1742b99c246de 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/SparkShell.scala +++ 
b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/SparkShell.scala @@ -14,19 +14,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.integration.tpc.action - -import org.apache.gluten.integration.tpc.{TpcRunner, TpcSuite} +package org.apache.gluten.integration.action +import org.apache.gluten.integration.{QueryRunner, Suite} import org.apache.spark.repl.Main -case class SparkShell(scale: Double) extends Action { - override def execute(tpcSuite: TpcSuite): Boolean = { - tpcSuite.sessionSwitcher.useSession("test", "Gluten Spark CLI") - val runner: TpcRunner = new TpcRunner(tpcSuite.queryResource(), tpcSuite.dataWritePath(scale)) - runner.createTables(tpcSuite.sessionSwitcher.spark()) - Main.sparkSession = tpcSuite.sessionSwitcher.spark() - Main.sparkContext = tpcSuite.sessionSwitcher.spark().sparkContext +case class SparkShell(scale: Double, genPartitionedData: Boolean) extends Action { + override def execute(suite: Suite): Boolean = { + suite.sessionSwitcher.useSession("test", "Spark CLI") + val runner: QueryRunner = + new QueryRunner(suite.queryResource(), suite.dataWritePath(scale, genPartitionedData)) + runner.createTables(suite.tableCreator(), suite.sessionSwitcher.spark()) + Main.sparkSession = suite.sessionSwitcher.spark() + Main.sparkContext = suite.sessionSwitcher.spark().sparkContext Main.main(Array("-usejavacp")) true } diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/TableRender.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/TableRender.scala new file mode 100644 index 0000000000000..2b1cca61e3f4a --- /dev/null +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/TableRender.scala @@ -0,0 +1,328 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.gluten.integration.action + +import org.apache.commons.lang3.StringUtils +import org.apache.gluten.integration.action.TableRender.RowParser.FieldAppender.RowAppender + +import java.io.{ByteArrayOutputStream, OutputStream, PrintStream} +import scala.collection.mutable + +trait TableRender[ROW <: Any] { + def appendRow(row: ROW): Unit + def print(s: OutputStream): Unit +} + +object TableRender { + def create[ROW <: Any](fields: Field*)(implicit parser: RowParser[ROW]): TableRender[ROW] = { + assert(fields.nonEmpty) + // Deep copy to avoid duplications (In case caller reuses a sub-tree). 
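[Editor's note] The Speedup column computed in QueriesCompare above is (baseline − test) / test expressed as a percentage; a quick REPL-style check with made-up timings:

// Baseline (vanilla) 2000 ms vs test (Gluten) 800 ms:
val expectedMillis = 2000L
val actualMillis = 800L
val speedUp = ((expectedMillis - actualMillis).toDouble / actualMillis) * 100
println("%.2f%%".format(speedUp)) // prints "150.00%"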
+ new Impl[ROW](Schema(fields.map(_.makeCopy())), parser) + } + + def plain[ROW <: Any](fields: String*)(implicit parser: RowParser[ROW]): TableRender[ROW] = { + assert(fields.nonEmpty) + new Impl[ROW](Schema(fields.map(Field.Leaf)), parser) + } + + trait Field { + def id(): Int = System.identityHashCode(this) + def name: String + def leafs: Seq[Field.Leaf] + def makeCopy(): Field + } + + object Field { + case class Branch(override val name: String, children: Seq[Field]) extends Field { + override val leafs: Seq[Leaf] = { + children.map(leafsOf).reduce(_ ++ _) + } + + private def leafsOf(field: Field): Seq[Field.Leaf] = { + field match { + case l @ Field.Leaf(_) => List(l) + case b @ Field.Branch(_, children) => + children.map(child => leafsOf(child)).reduce(_ ++ _) + } + } + + override def makeCopy(): Field = copy(name, children.map(_.makeCopy())) + } + case class Leaf(override val name: String) extends Field { + override val leafs: Seq[Leaf] = List(this) + override def makeCopy(): Field = copy() + } + } + + private case class Schema(fields: Seq[Field]) { + val leafs: Seq[Field.Leaf] = { + fields.map(_.leafs).reduce(_ ++ _) + } + + val maxNestingLevel: Int = { + fields.map(maxNestingLevelOf).max + } + + private def maxNestingLevelOf(field: Field): Int = { + field match { + case _: Field.Leaf => 1 + case Field.Branch(_, children) => children.map(maxNestingLevelOf).max + 1 + } + } + } + + private class Impl[ROW <: Any](schema: Schema, parser: RowParser[ROW]) + extends TableRender[ROW] { + private val appenderFactory = RowParser.FieldAppender.TableAppender.create(schema) + + override def appendRow(row: ROW): Unit = { + parser.parse(appenderFactory.newRow(), row) + } + + override def print(s: OutputStream): Unit = { + val data = appenderFactory.data() + val printer = new PrintStream(s) + if (data.isEmpty) { + printer.println("(N/A)") + printer.flush() + return + } + + // The map is incrementally updated while walking the schema tree from top down. 
+ val widthMap: mutable.Map[Int, Int] = mutable.Map() + + val dataWidths = schema.leafs.indices + .map { i => + data.map(_(i).length).max + } + .map(_ + 2) + + schema.leafs.zipWithIndex.foreach { + case (leaf, i) => + val dataWidth = dataWidths(i) + widthMap += (leaf.id() -> (dataWidth max (leaf.name.length + 2))) + } + + schema.fields.foreach { root => + def updateWidth(field: Field, lowerBound: Int): Unit = { + field match { + case branch @ Field.Branch(name, children) => + val leafLowerBound = + Math + .ceil((lowerBound max name.length + 2).toDouble / branch.leafs.size.toDouble) + .toInt + children.foreach(child => updateWidth(child, leafLowerBound * child.leafs.size)) + val childrenWidth = + children.map(child => widthMap(child.id())).sum + val width = childrenWidth + children.size - 1 + val hash = branch.id() + widthMap += hash -> width + case leaf @ Field.Leaf(name) => + val hash = leaf.id() + val newWidth = widthMap(hash) max lowerBound + widthMap.put(hash, newWidth) + case _ => new IllegalStateException() + } + } + + updateWidth(root, 0) + } + + trait SchemaCell + case class Given(field: Field) extends SchemaCell + case class PlaceHolder(leaf: Field.Leaf) extends SchemaCell + + (0 until schema.maxNestingLevel).foldRight[Seq[SchemaCell]](schema.fields.map(Given)) { + case (_, cells) => + val schemaLine = cells + .map { + case Given(field) => + (field.name, widthMap(field.id())) + case PlaceHolder(leaf) => + ("", widthMap(leaf.id())) + } + .map { + case (name, width) => + StringUtils.center(name, width) + } + .mkString("|", "|", "|") + printer.println(schemaLine) + cells.flatMap { f => + f match { + case Given(Field.Branch(name, children)) => children.map(Given) + case Given(l @ Field.Leaf(name)) => List(PlaceHolder(l)) + case p: PlaceHolder => List(p) + case _ => throw new IllegalStateException() + } + } + } + + val separationLine = schema.leafs + .map { leaf => + widthMap(leaf.id()) + } + .map { width => + new String(Array.tabulate(width)(_ => '-')) + } + .mkString("|", "|", "|") + + printer.println(separationLine) + + data.foreach { row => + val dataLine = row + .zip(schema.leafs) + .map { + case (value, leaf) => + (value, widthMap(leaf.id())) + } + .map { + case (value, width) => + StringUtils.leftPad(value, width) + } + .mkString("|", "|", "|") + printer.println(dataLine) + } + + printer.flush() + } + + override def toString: String = { + val out = new ByteArrayOutputStream() + print(out) + out.toString + } + } + + trait RowParser[ROW <: Any] { + def parse(rowFactory: RowAppender, row: ROW): Unit + } + + object RowParser { + trait FieldAppender { + def child(name: String): FieldAppender + def write(value: Any): Unit + } + + object FieldAppender { + trait RowAppender { + def field(name: String): FieldAppender + def field(offset: Int): FieldAppender + def incremental(): RowAppender.Incremental + } + + object RowAppender { + def create( + schema: Schema, + mutableRows: mutable.ListBuffer[Array[String]]): RowAppender = { + new RowAppenderImpl(schema, mutableRows) + } + + trait Incremental { + def next(): FieldAppender + } + + private class RowAppenderImpl( + schema: Schema, + mutableRows: mutable.ListBuffer[Array[String]]) + extends RowAppender { + private val mutableRow = Array.tabulate(schema.leafs.size) { _ => + "UNFILLED" + } + mutableRows += mutableRow + + override def field(name: String): FieldAppender = { + val fields = schema.fields + assert(fields.count(_.name == name) == 1) + val field = fields.zipWithIndex.find(_._1.name == name).getOrElse { + throw new 
IllegalArgumentException(s"Field $name not found in $schema") + } + val column = field._2 + new FieldAppenderImpl(field._1, mutableRow, column) + } + + override def field(offset: Int): FieldAppender = { + new FieldAppenderImpl(schema.fields(offset), mutableRow, offset) + } + + override def incremental(): Incremental = { + new Incremental { + private var offset = 0 + override def next(): FieldAppender = { + val out = new FieldAppenderImpl(schema.leafs(offset), mutableRow, offset) + offset += 1 + out + } + } + } + } + } + + trait TableAppender { + def newRow(): RowAppender + def data(): Seq[Seq[String]] + } + + object TableAppender { + def create(schema: Schema): TableAppender = { + new TableAppenderImpl(schema) + } + + private class TableAppenderImpl(schema: Schema) extends TableAppender { + private val mutableRows: mutable.ListBuffer[Array[String]] = mutable.ListBuffer() + + override def newRow(): RowAppender = { + RowAppender.create(schema, mutableRows) + } + + override def data(): Seq[Seq[String]] = { + mutableRows.map(_.toSeq) + } + } + } + + private class FieldAppenderImpl(field: Field, mutableRow: Array[String], column: Int) + extends FieldAppender { + override def child(name: String): FieldAppender = { + field match { + case Field.Branch(_, children) => + assert(children.count(_.name == name) == 1) + val child = children.zipWithIndex.find(_._1.name == name).getOrElse { + throw new IllegalArgumentException(s"Field $name not found in $field") + } + val childField = child._1 + val childOffset = child._2 + new FieldAppenderImpl(childField, mutableRow, column + childOffset) + case _ => + throw new IllegalArgumentException(s"Field $field is not a branch") + } + } + + override def write(value: Any): Unit = { + assert(field.isInstanceOf[Field.Leaf]) + mutableRow(column) = toString(value) + } + + private def toString(value: Any): String = value match { + case Some(v) => toString(v) + case None => "N/A" + case other => other.toString + } + } + } + } +} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/package.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/package.scala new file mode 100644 index 0000000000000..a84915ebe1a93 --- /dev/null +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/package.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.gluten.integration + +import org.apache.spark.sql.RunResult + +package object action { + + implicit class QueryResultsOps(results: Iterable[QueryRunner.QueryResult]) { + def asSuccesses(): Iterable[QueryRunner.Success] = { + results.map(_.asSuccess()) + } + + def asFailures(): Iterable[QueryRunner.Failure] = { + results.map(_.asFailure()) + } + } + + implicit class CompletedOps(completed: Iterable[QueryRunner.Success]) { + def agg(name: String): Option[QueryRunner.Success] = { + completed.reduceOption { (c1, c2) => + QueryRunner.Success( + name, + RunResult( + c1.runResult.rows ++ c2.runResult.rows, + c1.runResult.planningTimeMillis + c2.runResult.planningTimeMillis, + c1.runResult.executionTimeMillis + c2.runResult.executionTimeMillis, + (c1.runResult.metrics, c2.runResult.metrics).sumUp)) + } + } + } + + implicit class DualMetricsOps(value: (Map[String, Long], Map[String, Long])) { + def sumUp: Map[String, Long] = { + assert(value._1.keySet == value._2.keySet) + value._1.map { case (k, v) => k -> (v + value._2(k)) } + } + } +} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchDataGen.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchDataGen.scala new file mode 100644 index 0000000000000..add7b01feb6ca --- /dev/null +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchDataGen.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.integration.clickbench + +import org.apache.commons.io.FileUtils +import org.apache.gluten.integration.DataGen +import org.apache.spark.sql.SparkSession + +import java.io.File +import scala.language.postfixOps +import scala.sys.process._ + +class ClickBenchDataGen(val spark: SparkSession, dir: String) extends DataGen { + import ClickBenchDataGen._ + override def gen(): Unit = { + println(s"Start to download ClickBench Parquet dataset from URL: $DATA_URL... ") + // Directly download from official URL. 
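[Editor's note] The agg/sumUp helpers in the action package object above merge successful runs by summing row counts, planning and execution times, and per-key metrics; a hedged illustration of the metric merge (the metric names are invented):

import org.apache.gluten.integration.action._ // brings the DualMetricsOps implicit into scope

object MetricsMergeDemo {
  // sumUp expects both maps to carry identical key sets and adds the values per key:
  val first = Map("scanTime" -> 5L, "peakBytes" -> 100L)
  val second = Map("scanTime" -> 7L, "peakBytes" -> 250L)
  val merged: Map[String, Long] = (first, second).sumUp // Map("scanTime" -> 12, "peakBytes" -> 350)
}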
+ val target = new File(dir + File.separator + FILE_NAME) + FileUtils.forceMkdirParent(target) + val cmd = + s"wget --no-verbose --show-progress --progress=bar:force:noscroll -O $target $DATA_URL" + println(s"Executing command: $cmd") + val code = Process(cmd) !; + if (code != 0) { + throw new RuntimeException("Download failed") + } + println(s"ClickBench Parquet dataset successfully downloaded to $target.") + } +} + +object ClickBenchDataGen { + private val DATA_URL = "https://datasets.clickhouse.com/hits_compatible/hits.parquet" + private[clickbench] val FILE_NAME = "hits.parquet" +} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchSuite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchSuite.scala new file mode 100644 index 0000000000000..04a34d332b610 --- /dev/null +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchSuite.scala @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
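[Editor's note] A hedged sketch of driving the ClickBench download directly; the SparkSession is assumed to be available, and the target directory mirrors the default used by ClickBenchSuite below.

import org.apache.gluten.integration.clickbench.ClickBenchDataGen
import org.apache.spark.sql.SparkSession

def downloadHits(spark: SparkSession): Unit = {
  // Downloads hits.parquet (a multi-gigabyte file) from the official ClickBench URL.
  new ClickBenchDataGen(spark, "/tmp/clickbench-generated").gen()
}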
+ */ +package org.apache.gluten.integration.clickbench + +import org.apache.gluten.integration.action.Action +import org.apache.gluten.integration.{DataGen, Suite, TableCreator} +import org.apache.log4j.Level +import org.apache.spark.SparkConf + +/** + * ClickBench: a Benchmark For Analytical Databases + * + * See the project: https://github.com/ClickHouse/ClickBench + * Site: https://benchmark.clickhouse.com/ + */ +class ClickBenchSuite( + val masterUrl: String, + val actions: Array[Action], + val testConf: SparkConf, + val baselineConf: SparkConf, + val extraSparkConf: Map[String, String], + val logLevel: Level, + val errorOnMemLeak: Boolean, + val enableUi: Boolean, + val enableHsUi: Boolean, + val hsUiPort: Int, + val offHeapSize: String, + val disableAqe: Boolean, + val disableBhj: Boolean, + val disableWscg: Boolean, + val shufflePartitions: Int, + val scanPartitions: Int) + extends Suite( + masterUrl, + actions, + testConf, + baselineConf, + extraSparkConf, + logLevel, + errorOnMemLeak, + enableUi, + enableHsUi, + hsUiPort, + offHeapSize, + disableAqe, + disableBhj, + disableWscg, + shufflePartitions, + scanPartitions) { + import ClickBenchSuite._ + + override protected def historyWritePath(): String = HISTORY_WRITE_PATH + + override private[integration] def dataWritePath( + scale: Double, + genPartitionedData: Boolean): String = { + checkDataGenArgs(scale, genPartitionedData) + DATA_WRITE_PATH + } + + override private[integration] def createDataGen( + scale: Double, + genPartitionedData: Boolean): DataGen = { + new ClickBenchDataGen(sessionSwitcher.spark(), dataWritePath(scale, genPartitionedData)) + } + + override private[integration] def queryResource(): String = "/clickbench-queries" + + override private[integration] def allQueryIds(): Array[String] = ALL_QUERY_IDS + + override private[integration] def desc(): String = "ClickBench" + + override def tableCreator(): TableCreator = ClickBenchTableCreator +} + +private object ClickBenchSuite { + private val DATA_WRITE_PATH = "/tmp/clickbench-generated" + private val HISTORY_WRITE_PATH = "/tmp/clickbench-history" + private val ALL_QUERY_IDS = (1 to 43).map(i => s"q$i").toArray + + private def checkDataGenArgs(scale: Double, genPartitionedData: Boolean): Unit = { + assert(scale == 1.0D, "ClickBench suite doesn't support scale factor other than 1") + assert(!genPartitionedData, "ClickBench suite doesn't support generating partitioned data") + } +} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchTableCreator.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchTableCreator.scala new file mode 100644 index 0000000000000..33eac38629ef0 --- /dev/null +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchTableCreator.scala @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.gluten.integration.clickbench + +import org.apache.gluten.integration.TableCreator +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{AnalysisException, SparkSession} + +import java.io.File + +object ClickBenchTableCreator extends TableCreator { + private val TABLE_NAME = "hits" + private val SCHEMA: StructType = StructType.fromDDL(""" + |watchid bigint, + |javaenable smallint, + |title varchar(65535), + |goodevent smallint, + |eventtime timestamp, + |eventdate date, + |counterid int, + |clientip int, + |regionid int, + |userid bigint, + |counterclass smallint, + |os smallint, + |useragent smallint, + |url varchar(65535), + |referer varchar(65535), + |isrefresh smallint, + |referercategoryid smallint, + |refererregionid int, + |urlcategoryid smallint, + |urlregionid int, + |resolutionwidth smallint, + |resolutionheight smallint, + |resolutiondepth smallint, + |flashmajor smallint, + |flashminor smallint, + |flashminor2 varchar(65535), + |netmajor smallint, + |netminor smallint, + |useragentmajor smallint, + |useragentminor varchar(65535), + |cookieenable smallint, + |javascriptenable smallint, + |ismobile smallint, + |mobilephone smallint, + |mobilephonemodel varchar(65535), + |params varchar(65535), + |ipnetworkid int, + |traficsourceid smallint, + |searchengineid smallint, + |searchphrase varchar(65535), + |advengineid smallint, + |isartifical smallint, + |windowclientwidth smallint, + |windowclientheight smallint, + |clienttimezone smallint, + |clienteventtime timestamp, + |silverlightversion1 smallint, + |silverlightversion2 smallint, + |silverlightversion3 int, + |silverlightversion4 smallint, + |pagecharset varchar(65535), + |codeversion int, + |islink smallint, + |isdownload smallint, + |isnotbounce smallint, + |funiqid bigint, + |originalurl varchar(65535), + |hid int, + |isoldcounter smallint, + |isevent smallint, + |isparameter smallint, + |dontcounthits smallint, + |withhash smallint, + |hitcolor varchar(65535), + |localeventtime timestamp, + |age smallint, + |sex smallint, + |income smallint, + |interests smallint, + |robotness smallint, + |remoteip int, + |windowname int, + |openername int, + |historylength smallint, + |browserlanguage varchar(65535), + |browsercountry varchar(65535), + |socialnetwork varchar(65535), + |socialaction varchar(65535), + |httperror smallint, + |sendtiming int, + |dnstiming int, + |connecttiming int, + |responsestarttiming int, + |responseendtiming int, + |fetchtiming int, + |socialsourcenetworkid smallint, + |socialsourcepage varchar(65535), + |paramprice bigint, + |paramorderid varchar(65535), + |paramcurrency varchar(65535), + |paramcurrencyid smallint, + |openstatservicename varchar(65535), + |openstatcampaignid varchar(65535), + |openstatadid varchar(65535), + |openstatsourceid varchar(65535), + |utmsource varchar(65535), + |utmmedium varchar(65535), + |utmcampaign varchar(65535), + |utmcontent varchar(65535), + |utmterm varchar(65535), + |fromtag varchar(65535), + |hasgclid smallint, + |refererhash bigint, + |urlhash bigint, + |clid int + |""".stripMargin) + + override def 
create(spark: SparkSession, dataPath: String): Unit = { + val file = new File(dataPath + File.separator + ClickBenchDataGen.FILE_NAME) + if (spark.catalog.tableExists(TABLE_NAME)) { + println("Table exists: " + TABLE_NAME) + return + } + println("Creating catalog table: " + TABLE_NAME) + spark.catalog.createTable(TABLE_NAME, "parquet", SCHEMA, Map("path" -> file.getAbsolutePath)) + try { + spark.catalog.recoverPartitions(file.getName) + } catch { + case _: AnalysisException => + } + } +} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/ds/TpcdsDataGen.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsDataGen.scala similarity index 99% rename from tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/ds/TpcdsDataGen.scala rename to tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsDataGen.scala index 82d16dd90f1af..7d63fc67a999d 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/ds/TpcdsDataGen.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsDataGen.scala @@ -14,9 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.integration.tpc.ds +package org.apache.gluten.integration.ds -import org.apache.gluten.integration.tpc.{DataGen, ShimUtils, TypeModifier} +import org.apache.gluten.integration.{DataGen, ShimUtils, TypeModifier} import org.apache.spark.sql.{Column, Row, SaveMode, SparkSession} import org.apache.spark.sql.types._ diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/ds/TpcdsSuite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala similarity index 77% rename from tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/ds/TpcdsSuite.scala rename to tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala index c703821c1b0f7..a4365afde38dc 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/ds/TpcdsSuite.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala @@ -14,19 +14,17 @@ * See the License for the specific language governing permissions and * limitations under the License. 
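[Editor's note] Once ClickBenchTableCreator has registered the hits table, queries reference it by name; a hedged example (this query is illustrative, not one of the 43 bundled ClickBench queries):

import org.apache.spark.sql.SparkSession

def countHits(spark: SparkSession): Long = {
  // Assumes ClickBenchTableCreator.create(spark, dataPath) has already run.
  spark.sql("SELECT COUNT(*) FROM hits").collect().head.getLong(0)
}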
*/ -package org.apache.gluten.integration.tpc.ds +package org.apache.gluten.integration.ds -import org.apache.gluten.integration.tpc.{Constants, DataGen, TpcSuite, TypeModifier} -import org.apache.gluten.integration.tpc.action.Action -import org.apache.gluten.integration.tpc.ds.TpcdsSuite.{ +import org.apache.gluten.integration.action.Action +import org.apache.gluten.integration.ds.TpcdsSuite.{ ALL_QUERY_IDS, HISTORY_WRITE_PATH, TPCDS_WRITE_PATH } - -import org.apache.spark.SparkConf - +import org.apache.gluten.integration.{DataGen, Suite, TableCreator, TypeModifier} import org.apache.log4j.Level +import org.apache.spark.SparkConf class TpcdsSuite( val masterUrl: String, @@ -44,8 +42,8 @@ class TpcdsSuite( val disableBhj: Boolean, val disableWscg: Boolean, val shufflePartitions: Int, - val minimumScanPartitions: Boolean) - extends TpcSuite( + val scanPartitions: Int) + extends Suite( masterUrl, actions, testConf, @@ -61,32 +59,39 @@ class TpcdsSuite( disableBhj, disableWscg, shufflePartitions, - minimumScanPartitions) { + scanPartitions) { override protected def historyWritePath(): String = HISTORY_WRITE_PATH - override private[tpc] def dataWritePath(scale: Double): String = TPCDS_WRITE_PATH + s"-$scale" + override private[integration] def dataWritePath( + scale: Double, + genPartitionedData: Boolean): String = + TPCDS_WRITE_PATH + s"-$scale" - override private[tpc] def createDataGen(scale: Double, genPartitionedData: Boolean): DataGen = + override private[integration] def createDataGen( + scale: Double, + genPartitionedData: Boolean): DataGen = new TpcdsDataGen( sessionSwitcher.spark(), scale, shufflePartitions, - dataWritePath(scale), + dataWritePath(scale, genPartitionedData), typeModifiers(), genPartitionedData) - override private[tpc] def queryResource(): String = { + override private[integration] def queryResource(): String = { "/tpcds-queries" } - override protected def typeModifiers(): List[TypeModifier] = { + private def typeModifiers(): List[TypeModifier] = { List() } - override private[tpc] def allQueryIds(): Array[String] = ALL_QUERY_IDS + override private[integration] def allQueryIds(): Array[String] = ALL_QUERY_IDS + + override private[integration] def desc(): String = "TPC-DS" - override private[tpc] def desc(): String = "TPC-DS" + override def tableCreator(): TableCreator = TableCreator.discoverSchema() } object TpcdsSuite { diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/h/TpchDataGen.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchDataGen.scala similarity index 98% rename from tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/h/TpchDataGen.scala rename to tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchDataGen.scala index fa574f59c5d21..5223c61c99e06 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/h/TpchDataGen.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchDataGen.scala @@ -14,9 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
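[Editor's note] For orientation, a suite's queryResource() value combines with a query id inside QueryRunner via the "%s/%s.sql" format; a REPL-style trace (the id is an example):

// TpcdsSuite.queryResource() returns "/tpcds-queries"; QueryRunner appends "<id>.sql":
val queryResourceFolder = "/tpcds-queries"
val caseId = "q1"
val resourcePath = "%s/%s.sql".format(queryResourceFolder, caseId) // "/tpcds-queries/q1.sql"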
*/ -package org.apache.gluten.integration.tpc.h +package org.apache.gluten.integration.h -import org.apache.gluten.integration.tpc.{DataGen, ShimUtils, TypeModifier} +import org.apache.gluten.integration.{DataGen, ShimUtils, TypeModifier} import org.apache.spark.sql.{Row, SaveMode, SparkSession} import org.apache.spark.sql.types._ diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/h/TpchSuite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala similarity index 69% rename from tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/h/TpchSuite.scala rename to tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala index 9fbd83dc2f661..bdcac3bffe39d 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/h/TpchSuite.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala @@ -14,15 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.integration.tpc.h - -import org.apache.gluten.integration.tpc.{DataGen, TpcSuite, TypeModifier} -import org.apache.gluten.integration.tpc.action.Action -import org.apache.gluten.integration.tpc.h.TpchSuite.{HISTORY_WRITE_PATH, TPCH_WRITE_PATH} - -import org.apache.spark.SparkConf +package org.apache.gluten.integration.h +import org.apache.gluten.integration.action.Action +import org.apache.gluten.integration.h.TpchSuite.{HISTORY_WRITE_PATH, TPCH_WRITE_PATH} +import org.apache.gluten.integration.{DataGen, Suite, TableCreator, TypeModifier} import org.apache.log4j.Level +import org.apache.spark.SparkConf class TpchSuite( val masterUrl: String, @@ -40,8 +38,8 @@ class TpchSuite( val disableBhj: Boolean, val disableWscg: Boolean, val shufflePartitions: Int, - val minimumScanPartitions: Boolean) - extends TpcSuite( + val scanPartitions: Int) + extends Suite( masterUrl, actions, testConf, @@ -57,31 +55,38 @@ class TpchSuite( disableBhj, disableWscg, shufflePartitions, - minimumScanPartitions) { + scanPartitions) { override protected def historyWritePath(): String = HISTORY_WRITE_PATH - override private[tpc] def dataWritePath(scale: Double): String = TPCH_WRITE_PATH + s"-$scale" + override private[integration] def dataWritePath( + scale: Double, + genPartitionedData: Boolean): String = + TPCH_WRITE_PATH + s"-$scale" - override private[tpc] def createDataGen(scale: Double, genPartitionedData: Boolean): DataGen = + override private[integration] def createDataGen( + scale: Double, + genPartitionedData: Boolean): DataGen = new TpchDataGen( sessionSwitcher.spark(), scale, shufflePartitions, - dataWritePath(scale), + dataWritePath(scale, genPartitionedData), typeModifiers()) - override private[tpc] def queryResource(): String = { + override private[integration] def queryResource(): String = { "/tpch-queries" } - override protected def typeModifiers(): List[TypeModifier] = { + private def typeModifiers(): List[TypeModifier] = { List() } - override private[tpc] def allQueryIds(): Array[String] = TpchSuite.ALL_QUERY_IDS + override private[integration] def allQueryIds(): Array[String] = TpchSuite.ALL_QUERY_IDS + + override private[integration] def desc(): String = "TPC-H" - override private[tpc] def desc(): String = "TPC-H" + override def tableCreator(): TableCreator = TableCreator.discoverSchema() } object TpchSuite { diff --git 
a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/TpcRunner.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/TpcRunner.scala deleted file mode 100644 index 908b8206eecd8..0000000000000 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/TpcRunner.scala +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.gluten.integration.tpc - -import org.apache.spark.sql.{AnalysisException, QueryRunner, RunResult, SparkSession} - -import com.google.common.base.Preconditions -import org.apache.commons.io.FileUtils - -import java.io.File - -class TpcRunner(val queryResourceFolder: String, val dataPath: String) { - Preconditions.checkState( - new File(dataPath).exists(), - s"Data not found at $dataPath, try using command ` data-gen-only ` to generate it first.", - Array(): _*) - - def createTables(spark: SparkSession): Unit = { - TpcRunner.createTables(spark, dataPath) - } - - def runTpcQuery( - spark: SparkSession, - desc: String, - caseId: String, - explain: Boolean = false, - metrics: Array[String] = Array(), - randomKillTasks: Boolean = false): RunResult = { - val path = "%s/%s.sql".format(queryResourceFolder, caseId) - QueryRunner.runTpcQuery(spark, desc, path, explain, metrics, randomKillTasks) - } -} - -object TpcRunner { - def createTables(spark: SparkSession, dataPath: String): Unit = { - val files = new File(dataPath).listFiles() - files.foreach(file => { - if (spark.catalog.tableExists(file.getName)) { - println("Table exists: " + file.getName) - } else { - println("Creating catalog table: " + file.getName) - spark.catalog.createTable(file.getName, file.getAbsolutePath, "parquet") - try { - spark.catalog.recoverPartitions(file.getName) - } catch { - case _: AnalysisException => - } - } - }) - } - - private def delete(path: String): Unit = { - FileUtils.forceDelete(new File(path)) - } -} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/Parameterized.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/Parameterized.scala deleted file mode 100644 index 6fc4e66d6f056..0000000000000 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/Parameterized.scala +++ /dev/null @@ -1,305 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.gluten.integration.tpc.action - -import org.apache.gluten.integration.stat.RamStat -import org.apache.gluten.integration.tpc.{TpcRunner, TpcSuite} -import org.apache.spark.sql.ConfUtils.ConfImplicits._ -import org.apache.spark.sql.SparkSessionSwitcher -import org.apache.commons.lang3.exception.ExceptionUtils -import org.apache.gluten.integration.tpc.action.Actions.QuerySelector - -import scala.collection.immutable.Map -import scala.collection.mutable -import scala.collection.mutable.{ArrayBuffer, ListBuffer} - -class Parameterized( - scale: Double, - queries: QuerySelector, - explain: Boolean, - iterations: Int, - warmupIterations: Int, - configDimensions: Seq[Dim], - excludedCombinations: Seq[Set[DimKv]], - metrics: Array[String]) - extends Action { - - private def validateDims(configDimensions: Seq[Dim]): Unit = { - if (configDimensions - .map(dim => { - dim.name - }) - .toSet - .size != configDimensions.size) { - throw new IllegalStateException("Duplicated dimension name found") - } - - configDimensions.foreach { dim => - if (dim.dimValues.map(dimValue => dimValue.name).toSet.size != dim.dimValues.size) { - throw new IllegalStateException("Duplicated dimension value found") - } - } - } - - private val coordinates: mutable.LinkedHashMap[Coordinate, Seq[(String, String)]] = { - validateDims(configDimensions) - val dimCount = configDimensions.size - val coordinateMap = mutable.LinkedHashMap[Coordinate, Seq[(String, String)]]() - - def fillCoordinates( - dimOffset: Int, - intermediateCoordinates: Map[String, String], - intermediateConf: Seq[(String, String)]): Unit = { - if (dimOffset == dimCount) { - // we got one coordinate - excludedCombinations.foreach { ec: Set[DimKv] => - if (ec.forall { kv => - intermediateCoordinates.contains(kv.k) && intermediateCoordinates(kv.k) == kv.v - }) { - println(s"Coordinate ${Coordinate(intermediateCoordinates)} excluded by $ec.") - return - } - } - coordinateMap(Coordinate(intermediateCoordinates)) = intermediateConf - return - } - val dim = configDimensions(dimOffset) - dim.dimValues.foreach { dimValue => - fillCoordinates( - dimOffset + 1, - intermediateCoordinates + (dim.name -> dimValue.name), - intermediateConf ++ dimValue.conf) - } - } - - fillCoordinates(0, Map(), Seq()) - - coordinateMap - } - - override def execute(tpcSuite: TpcSuite): Boolean = { - val runner: TpcRunner = new TpcRunner(tpcSuite.queryResource(), tpcSuite.dataWritePath(scale)) - val allQueries = tpcSuite.allQueryIds() - - val sessionSwitcher = tpcSuite.sessionSwitcher - val testConf = tpcSuite.getTestConf() - - println("Prepared coordinates: ") - coordinates.toList.map(_._1).zipWithIndex.foreach { - case (c, idx) => - println(s" $idx: $c") - } - coordinates.foreach { entry => - // register one session per coordinate - val coordinate = entry._1 - val coordinateConf = entry._2 - val conf = testConf.clone() - conf.setAllWarningOnOverriding(coordinateConf) - sessionSwitcher.registerSession(coordinate.toString, conf) - } - - val runQueryIds = queries.select(tpcSuite) - - // warm up - (0 until warmupIterations).foreach { _ => - runQueryIds.foreach 
{ queryId => - Parameterized.warmUp(queryId, tpcSuite.desc(), sessionSwitcher, runner) - } - } - - val results = coordinates.flatMap { entry => - val coordinate = entry._1 - val coordinateResults = (0 until iterations).flatMap { iteration => - println(s"Running tests (iteration $iteration) with coordinate $coordinate...") - runQueryIds.map { queryId => - Parameterized.runTpcQuery( - runner, - sessionSwitcher, - queryId, - coordinate, - tpcSuite.desc(), - explain, - metrics) - } - }.toList - coordinateResults - } - - val dimNames = configDimensions.map(dim => dim.name) - - val passedCount = results.count(l => l.succeed) - val count = results.count(_ => true) - - // RAM stats - println("Performing GC to collect RAM statistics... ") - System.gc() - System.gc() - printf( - "RAM statistics: JVM Heap size: %d KiB (total %d KiB), Process RSS: %d KiB\n", - RamStat.getJvmHeapUsed(), - RamStat.getJvmHeapTotal(), - RamStat.getProcessRamUsed()) - - println("") - println("Test report: ") - println("") - printf("Summary: %d out of %d queries passed. \n", passedCount, count) - println("") - TestResultLines(dimNames, metrics, results.filter(_.succeed)).print() - println("") - - if (passedCount == count) { - println("No failed queries. ") - println("") - } else { - println("Failed queries: ") - println("") - TestResultLines(dimNames, metrics, results.filter(!_.succeed)).print() - println("") - } - - if (passedCount != count) { - return false - } - true - } -} - -case class DimKv(k: String, v: String) -case class Dim(name: String, dimValues: Seq[DimValue]) -case class DimValue(name: String, conf: Seq[(String, String)]) -case class Coordinate(coordinate: Map[String, String]) // [dim, dim value] - -case class TestResultLine( - queryId: String, - succeed: Boolean, - coordinate: Coordinate, - rowCount: Option[Long], - planningTimeMillis: Option[Long], - executionTimeMillis: Option[Long], - metrics: Map[String, Long], - errorMessage: Option[String]) - -object TestResultLine { - class Parser(dimNames: Seq[String], metricNames: Seq[String]) - extends TableFormatter.RowParser[TestResultLine] { - override def parse(line: TestResultLine): Seq[Any] = { - val values = ArrayBuffer[Any](line.queryId, line.succeed) - dimNames.foreach { dimName => - val coordinate = line.coordinate.coordinate - if (!coordinate.contains(dimName)) { - throw new IllegalStateException("Dimension name not found" + dimName) - } - values.append(coordinate(dimName)) - } - metricNames.foreach { metricName => - val metrics = line.metrics - values.append(metrics.getOrElse(metricName, "N/A")) - } - values.append(line.rowCount.getOrElse("N/A")) - values.append(line.planningTimeMillis.getOrElse("N/A")) - values.append(line.executionTimeMillis.getOrElse("N/A")) - values - } - } -} - -case class TestResultLines( - dimNames: Seq[String], - metricNames: Seq[String], - lines: Iterable[TestResultLine]) { - def print(): Unit = { - val fields = ListBuffer[String]("Query ID", "Succeed") - dimNames.foreach(dimName => fields.append(dimName)) - metricNames.foreach(metricName => fields.append(metricName)) - fields.append("Row Count") - fields.append("Planning Time (Millis)") - fields.append("Query Time (Millis)") - val formatter = TableFormatter.create[TestResultLine](fields: _*)( - new TestResultLine.Parser(dimNames, metricNames)) - - lines.foreach { line => - formatter.appendRow(line) - } - - formatter.print(System.out) - } -} - -object Parameterized { - private def runTpcQuery( - runner: TpcRunner, - sessionSwitcher: SparkSessionSwitcher, - id: String, - 
coordinate: Coordinate, - desc: String, - explain: Boolean, - metrics: Array[String]) = { - println(s"Running query: $id...") - try { - val testDesc = "Gluten Spark %s %s %s".format(desc, id, coordinate) - sessionSwitcher.useSession(coordinate.toString, testDesc) - runner.createTables(sessionSwitcher.spark()) - val result = - runner.runTpcQuery(sessionSwitcher.spark(), testDesc, id, explain, metrics) - val resultRows = result.rows - println( - s"Successfully ran query $id. " + - s"Returned row count: ${resultRows.length}") - TestResultLine( - id, - succeed = true, - coordinate, - Some(resultRows.length), - Some(result.planningTimeMillis), - Some(result.executionTimeMillis), - result.metrics, - None) - } catch { - case e: Exception => - val error = Some(s"FATAL: ${ExceptionUtils.getStackTrace(e)}") - println( - s"Error running query $id. " + - s" Error: ${error.get}") - TestResultLine(id, succeed = false, coordinate, None, None, None, Map.empty, error) - } - } - - private[tpc] def warmUp( - id: String, - desc: String, - sessionSwitcher: SparkSessionSwitcher, - runner: TpcRunner): Unit = { - println(s"Warming up: Running query: $id...") - try { - val testDesc = "Gluten Spark %s %s warm up".format(desc, id) - sessionSwitcher.useSession("test", testDesc) - runner.createTables(sessionSwitcher.spark()) - val result = runner.runTpcQuery(sessionSwitcher.spark(), testDesc, id, explain = false) - val resultRows = result.rows - println( - s"Warming up: Successfully ran query $id. " + - s"Returned row count: ${resultRows.length}") - } catch { - case e: Exception => - val error = Some(s"FATAL: ${ExceptionUtils.getStackTrace(e)}") - println( - s"Warming up: Error running query $id. " + - s" Error: ${error.get}") - } - } -} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/Queries.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/Queries.scala deleted file mode 100644 index 290b8e3f5b0c6..0000000000000 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/Queries.scala +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.gluten.integration.tpc.action - -import org.apache.gluten.integration.stat.RamStat -import org.apache.gluten.integration.tpc.{TpcRunner, TpcSuite} -import org.apache.commons.lang3.exception.ExceptionUtils -import org.apache.gluten.integration.tpc.action.Actions.QuerySelector - -case class Queries( - scale: Double, - queries: QuerySelector, - explain: Boolean, - iterations: Int, - randomKillTasks: Boolean) - extends Action { - - override def execute(tpcSuite: TpcSuite): Boolean = { - val runQueryIds = queries.select(tpcSuite) - val runner: TpcRunner = new TpcRunner(tpcSuite.queryResource(), tpcSuite.dataWritePath(scale)) - val results = (0 until iterations).flatMap { iteration => - println(s"Running tests (iteration $iteration)...") - runQueryIds.map { queryId => - Queries.runTpcQuery( - runner, - tpcSuite.sessionSwitcher, - queryId, - tpcSuite.desc(), - explain, - randomKillTasks) - } - }.toList - - val passedCount = results.count(l => l.testPassed) - val count = results.count(_ => true) - - // RAM stats - println("Performing GC to collect RAM statistics... ") - System.gc() - System.gc() - printf( - "RAM statistics: JVM Heap size: %d KiB (total %d KiB), Process RSS: %d KiB\n", - RamStat.getJvmHeapUsed(), - RamStat.getJvmHeapTotal(), - RamStat.getProcessRamUsed()) - - println("") - println("Test report: ") - println("") - printf("Summary: %d out of %d queries passed. \n", passedCount, count) - println("") - val succeed = results.filter(_.testPassed) - Queries.printResults(succeed) - println("") - - if (passedCount == count) { - println("No failed queries. ") - println("") - } else { - println("Failed queries: ") - println("") - Queries.printResults(results.filter(!_.testPassed)) - println("") - } - - var all = Queries.aggregate(results, "all") - - if (passedCount != count) { - all = Queries.aggregate(succeed, "succeeded") ::: all - } - - println("Overall: ") - println("") - Queries.printResults(all) - println("") - - if (passedCount != count) { - return false - } - true - } -} - -object Queries { - case class TestResultLine( - queryId: String, - testPassed: Boolean, - rowCount: Option[Long], - planningTimeMillis: Option[Long], - executionTimeMillis: Option[Long], - errorMessage: Option[String]) - - object TestResultLine { - implicit object Parser extends TableFormatter.RowParser[TestResultLine] { - override def parse(line: TestResultLine): Seq[Any] = { - Seq( - line.queryId, - line.testPassed, - line.rowCount.getOrElse("N/A"), - line.planningTimeMillis.getOrElse("N/A"), - line.executionTimeMillis.getOrElse("N/A")) - } - } - } - - private def printResults(results: List[TestResultLine]): Unit = { - val formatter = TableFormatter.create[TestResultLine]( - "Query ID", - "Was Passed", - "Row Count", - "Plan Time (Millis)", - "Query Time (Millis)") - - results.foreach { line => - formatter.appendRow(line) - } - - formatter.print(System.out) - } - - private def aggregate(succeed: List[TestResultLine], name: String): List[TestResultLine] = { - if (succeed.isEmpty) { - return Nil - } - List( - succeed.reduce((r1, r2) => - TestResultLine( - name, - testPassed = true, - if (r1.rowCount.nonEmpty && r2.rowCount.nonEmpty) - Some(r1.rowCount.get + r2.rowCount.get) - else None, - if (r1.planningTimeMillis.nonEmpty && r2.planningTimeMillis.nonEmpty) - Some(r1.planningTimeMillis.get + r2.planningTimeMillis.get) - else None, - if (r1.executionTimeMillis.nonEmpty && r2.executionTimeMillis.nonEmpty) - Some(r1.executionTimeMillis.get + r2.executionTimeMillis.get) - else None, - None))) - 
} - - private def runTpcQuery( - runner: _root_.org.apache.gluten.integration.tpc.TpcRunner, - sessionSwitcher: _root_.org.apache.spark.sql.SparkSessionSwitcher, - id: _root_.java.lang.String, - desc: _root_.java.lang.String, - explain: Boolean, - randomKillTasks: Boolean) = { - println(s"Running query: $id...") - try { - val testDesc = "Gluten Spark %s %s".format(desc, id) - sessionSwitcher.useSession("test", testDesc) - runner.createTables(sessionSwitcher.spark()) - val result = runner.runTpcQuery( - sessionSwitcher.spark(), - testDesc, - id, - explain = explain, - randomKillTasks = randomKillTasks) - val resultRows = result.rows - println( - s"Successfully ran query $id. " + - s"Returned row count: ${resultRows.length}") - TestResultLine( - id, - testPassed = true, - Some(resultRows.length), - Some(result.planningTimeMillis), - Some(result.executionTimeMillis), - None) - } catch { - case e: Exception => - val error = Some(s"FATAL: ${ExceptionUtils.getStackTrace(e)}") - println( - s"Error running query $id. " + - s" Error: ${error.get}") - TestResultLine(id, testPassed = false, None, None, None, error) - } - } -} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/QueriesCompare.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/QueriesCompare.scala deleted file mode 100644 index 404d75cb426ea..0000000000000 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/QueriesCompare.scala +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.gluten.integration.tpc.action - -import org.apache.gluten.integration.stat.RamStat -import org.apache.gluten.integration.tpc.{TpcRunner, TpcSuite} -import org.apache.spark.sql.{SparkSessionSwitcher, TestUtils} -import org.apache.commons.lang3.exception.ExceptionUtils -import org.apache.gluten.integration.tpc.action.Actions.QuerySelector - -case class QueriesCompare( - scale: Double, - queries: QuerySelector, - explain: Boolean, - iterations: Int) - extends Action { - - override def execute(tpcSuite: TpcSuite): Boolean = { - val runner: TpcRunner = new TpcRunner(tpcSuite.queryResource(), tpcSuite.dataWritePath(scale)) - val runQueryIds = queries.select(tpcSuite) - val results = (0 until iterations).flatMap { iteration => - println(s"Running tests (iteration $iteration)...") - runQueryIds.map { queryId => - QueriesCompare.runTpcQuery( - queryId, - explain, - tpcSuite.desc(), - tpcSuite.sessionSwitcher, - runner) - } - }.toList - - val passedCount = results.count(l => l.testPassed) - val count = results.count(_ => true) - - // RAM stats - println("Performing GC to collect RAM statistics... 
") - System.gc() - System.gc() - printf( - "RAM statistics: JVM Heap size: %d KiB (total %d KiB), Process RSS: %d KiB\n", - RamStat.getJvmHeapUsed(), - RamStat.getJvmHeapTotal(), - RamStat.getProcessRamUsed()) - - println("") - println("Test report: ") - println("") - printf("Summary: %d out of %d queries passed. \n", passedCount, count) - println("") - val succeed = results.filter(_.testPassed) - QueriesCompare.printResults(succeed) - println("") - - if (passedCount == count) { - println("No failed queries. ") - println("") - } else { - println( - "Failed queries (a failed query with correct row count indicates value mismatches): ") - println("") - QueriesCompare.printResults(results.filter(!_.testPassed)) - println("") - } - - var all = QueriesCompare.aggregate(results, "all") - - if (passedCount != count) { - all = QueriesCompare.aggregate(succeed, "succeeded") ::: all - } - - println("Overall: ") - println("") - QueriesCompare.printResults(all) - println("") - - if (passedCount != count) { - return false - } - true - } -} - -object QueriesCompare { - case class TestResultLine( - queryId: String, - testPassed: Boolean, - expectedRowCount: Option[Long], - actualRowCount: Option[Long], - expectedPlanningTimeMillis: Option[Long], - actualPlanningTimeMillis: Option[Long], - expectedExecutionTimeMillis: Option[Long], - actualExecutionTimeMillis: Option[Long], - errorMessage: Option[String]) - - object TestResultLine { - implicit object Parser extends TableFormatter.RowParser[TestResultLine] { - override def parse(line: TestResultLine): Seq[Any] = { - val timeVariation = - if (line.expectedExecutionTimeMillis.nonEmpty && line.actualExecutionTimeMillis.nonEmpty) { - Some( - ((line.expectedExecutionTimeMillis.get - line.actualExecutionTimeMillis.get).toDouble - / line.actualExecutionTimeMillis.get.toDouble) * 100) - } else None - Seq( - line.queryId, - line.testPassed, - line.expectedRowCount.getOrElse("N/A"), - line.actualRowCount.getOrElse("N/A"), - line.expectedPlanningTimeMillis.getOrElse("N/A"), - line.actualPlanningTimeMillis.getOrElse("N/A"), - line.expectedExecutionTimeMillis.getOrElse("N/A"), - line.actualExecutionTimeMillis.getOrElse("N/A"), - timeVariation.map("%15.2f%%".format(_)).getOrElse("N/A")) - } - } - } - - private def printResults(results: List[TestResultLine]): Unit = { - val formatter = TableFormatter.create[TestResultLine]( - "Query ID", - "Was Passed", - "Expected Row Count", - "Actual Row Count", - "Baseline Planning Time (Millis)", - "Planning Time (Millis)", - "Baseline Query Time (Millis)", - "Query Time (Millis)", - "Query Time Variation") - - results.foreach { line => - formatter.appendRow(line) - } - - formatter.print(System.out) - } - - private def aggregate(succeed: List[TestResultLine], name: String): List[TestResultLine] = { - if (succeed.isEmpty) { - return Nil - } - List( - succeed.reduce((r1, r2) => - TestResultLine( - name, - testPassed = true, - if (r1.expectedRowCount.nonEmpty && r2.expectedRowCount.nonEmpty) - Some(r1.expectedRowCount.get + r2.expectedRowCount.get) - else None, - if (r1.actualRowCount.nonEmpty && r2.actualRowCount.nonEmpty) - Some(r1.actualRowCount.get + r2.actualRowCount.get) - else None, - if (r1.expectedPlanningTimeMillis.nonEmpty && r2.expectedPlanningTimeMillis.nonEmpty) - Some(r1.expectedPlanningTimeMillis.get + r2.expectedPlanningTimeMillis.get) - else None, - if (r1.actualPlanningTimeMillis.nonEmpty && r2.actualPlanningTimeMillis.nonEmpty) - Some(r1.actualPlanningTimeMillis.get + r2.actualPlanningTimeMillis.get) - else None, - 
if (r1.expectedExecutionTimeMillis.nonEmpty && r2.expectedExecutionTimeMillis.nonEmpty) - Some(r1.expectedExecutionTimeMillis.get + r2.expectedExecutionTimeMillis.get) - else None, - if (r1.actualExecutionTimeMillis.nonEmpty && r2.actualExecutionTimeMillis.nonEmpty) - Some(r1.actualExecutionTimeMillis.get + r2.actualExecutionTimeMillis.get) - else None, - None))) - } - - private[tpc] def runTpcQuery( - id: String, - explain: Boolean, - desc: String, - sessionSwitcher: SparkSessionSwitcher, - runner: TpcRunner): TestResultLine = { - println(s"Running query: $id...") - try { - val baseLineDesc = "Vanilla Spark %s %s".format(desc, id) - sessionSwitcher.useSession("baseline", baseLineDesc) - runner.createTables(sessionSwitcher.spark()) - val expected = - runner.runTpcQuery(sessionSwitcher.spark(), baseLineDesc, id, explain = explain) - val expectedRows = expected.rows - val testDesc = "Gluten Spark %s %s".format(desc, id) - sessionSwitcher.useSession("test", testDesc) - runner.createTables(sessionSwitcher.spark()) - val result = runner.runTpcQuery(sessionSwitcher.spark(), testDesc, id, explain = explain) - val resultRows = result.rows - val error = TestUtils.compareAnswers(resultRows, expectedRows, sort = true) - if (error.isEmpty) { - println( - s"Successfully ran query $id, result check was passed. " + - s"Returned row count: ${resultRows.length}, expected: ${expectedRows.length}") - return TestResultLine( - id, - testPassed = true, - Some(expectedRows.length), - Some(resultRows.length), - Some(expected.planningTimeMillis), - Some(result.planningTimeMillis), - Some(expected.executionTimeMillis), - Some(result.executionTimeMillis), - None) - } - println(s"Error running query $id, result check was not passed. " + - s"Returned row count: ${resultRows.length}, expected: ${expectedRows.length}, error: ${error.get}") - TestResultLine( - id, - testPassed = false, - Some(expectedRows.length), - Some(resultRows.length), - Some(expected.planningTimeMillis), - Some(result.planningTimeMillis), - Some(expected.executionTimeMillis), - Some(result.executionTimeMillis), - error) - } catch { - case e: Exception => - val error = Some(s"FATAL: ${ExceptionUtils.getStackTrace(e)}") - println( - s"Error running query $id. " + - s" Error: ${error.get}") - TestResultLine(id, testPassed = false, None, None, None, None, None, None, error) - } - } -} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/TableFormatter.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/TableFormatter.scala deleted file mode 100644 index 8aeea9938e901..0000000000000 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/tpc/action/TableFormatter.scala +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.gluten.integration.tpc.action - -import java.io.{OutputStream, PrintStream} -import scala.collection.mutable - -trait TableFormatter[ROW <: Any] { - import TableFormatter._ - def appendRow(row: ROW): Unit - def print(s: OutputStream): Unit -} - -object TableFormatter { - def create[ROW <: Any](fields: String*)( - implicit parser: RowParser[ROW]): TableFormatter[ROW] = { - assert(fields.nonEmpty) - new Impl[ROW](Schema(fields), parser) - } - - private case class Schema(fields: Seq[String]) - - private class Impl[ROW <: Any](schema: Schema, parser: RowParser[ROW]) - extends TableFormatter[ROW] { - private val rows = mutable.ListBuffer[Seq[String]]() - - override def appendRow(row: ROW): Unit = { - val parsed = parser.parse(row) - assert(parsed.size == schema.fields.size) - rows += parsed.map(_.toString) - } - - override def print(s: OutputStream): Unit = { - val printer = new PrintStream(s) - if (rows.isEmpty) { - printer.println("(N/A)") - printer.flush() - return - } - val numFields = schema.fields.size - val widths = (0 until numFields) - .map { i => - rows.map(_(i).length).max max schema.fields(i).length - } - .map(_ + 1) - val pBuilder = StringBuilder.newBuilder - pBuilder ++= "|" - widths.foreach { w => - pBuilder ++= s"%${w}s|" - } - val pattern = pBuilder.toString() - printer.println(String.format(pattern, schema.fields: _*)) - rows.foreach { r => - printer.println(String.format(pattern, r: _*)) - } - printer.flush() - } - } - - trait RowParser[ROW <: Any] { - def parse(row: ROW): Seq[Any] - } -} diff --git a/tools/gluten-it/common/src/main/scala/org/apache/spark/sql/QueryRunner.scala b/tools/gluten-it/common/src/main/scala/org/apache/spark/sql/SparkQueryRunner.scala similarity index 87% rename from tools/gluten-it/common/src/main/scala/org/apache/spark/sql/QueryRunner.scala rename to tools/gluten-it/common/src/main/scala/org/apache/spark/sql/SparkQueryRunner.scala index a5b699a1ae48e..b68f74c1d5ed7 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/spark/sql/QueryRunner.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/spark/sql/SparkQueryRunner.scala @@ -18,13 +18,9 @@ package org.apache.spark.sql import org.apache.spark.{SparkContext, Success, TaskKilled} import org.apache.spark.executor.ExecutorMetrics -import org.apache.spark.scheduler.{ - SparkListener, - SparkListenerExecutorMetricsUpdate, - SparkListenerTaskEnd, - SparkListenerTaskStart -} +import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorMetricsUpdate, SparkListenerTaskEnd, SparkListenerTaskStart} import org.apache.spark.sql.KillTaskListener.INIT_WAIT_TIME_MS +import org.apache.spark.sql.catalyst.QueryPlanningTracker import com.google.common.base.Preconditions import org.apache.commons.lang3.RandomUtils @@ -33,7 +29,7 @@ import java.io.ByteArrayOutputStream import java.nio.charset.StandardCharsets import java.util.concurrent.atomic.AtomicInteger -object QueryRunner { +object SparkQueryRunner { private val availableExecutorMetrics: Set[String] = Set( "JVMHeapMemory", "JVMOffHeapMemory", @@ -50,9 +46,10 @@ object QueryRunner { "ProcessTreePythonVMemory", "ProcessTreePythonRSSMemory", "ProcessTreeOtherVMemory", - "ProcessTreeOtherRSSMemory") + "ProcessTreeOtherRSSMemory" + ) - def runTpcQuery( + def runQuery( spark: SparkSession, desc: String, queryPath: String, @@ -82,31 +79,39 @@ object QueryRunner { println(s"Executing SQL query from resource path 
$queryPath...") try { + val tracker = new QueryPlanningTracker val sql = resourceToString(queryPath) val prev = System.nanoTime() val df = spark.sql(sql) - val rows = df.collect() + val rows = QueryPlanningTracker.withTracker(tracker) { + df.collect() + } if (explain) { df.explain(extended = true) } - val planMillis = - df.queryExecution.tracker.phases.values.map(p => p.endTimeMs - p.startTimeMs).sum + val sparkTracker = df.queryExecution.tracker + val sparkRulesMillis = + sparkTracker.rules.map(_._2.totalTimeNs).sum / 1000000L + val otherRulesMillis = + tracker.rules.map(_._2.totalTimeNs).sum / 1000000L + val planMillis = sparkRulesMillis + otherRulesMillis val totalMillis = (System.nanoTime() - prev) / 1000000L val collectedMetrics = metrics.map(name => (name, em.getMetricValue(name))).toMap RunResult(rows, planMillis, totalMillis - planMillis, collectedMetrics) } finally { sc.removeSparkListener(metricsListener) - killTaskListener.foreach(l => { - sc.removeSparkListener(l) - println(s"Successful kill rate ${"%.2f%%" - .format(100 * l.successfulKillRate())} during execution of app: ${sc.applicationId}") - }) + killTaskListener.foreach( + l => { + sc.removeSparkListener(l) + println(s"Successful kill rate ${"%.2f%%" + .format(100 * l.successfulKillRate())} during execution of app: ${sc.applicationId}") + }) sc.setJobDescription(null) } } private def resourceToString(resource: String): String = { - val inStream = QueryRunner.getClass.getResourceAsStream(resource) + val inStream = SparkQueryRunner.getClass.getResourceAsStream(resource) Preconditions.checkNotNull(inStream) val outStream = new ByteArrayOutputStream try { @@ -166,7 +171,8 @@ class KillTaskListener(val sc: SparkContext) extends SparkListener { val total = Math.min( stageKillMaxWaitTimeLookup.computeIfAbsent(taskStart.stageId, _ => Long.MaxValue), stageKillWaitTimeLookup - .computeIfAbsent(taskStart.stageId, _ => INIT_WAIT_TIME_MS)) + .computeIfAbsent(taskStart.stageId, _ => INIT_WAIT_TIME_MS) + ) val elapsed = System.currentTimeMillis() - startMs val remaining = total - elapsed if (remaining <= 0L) { @@ -180,6 +186,7 @@ class KillTaskListener(val sc: SparkContext) extends SparkListener { } throw new IllegalStateException() } + val elapsed = wait() // We have 50% chance to kill the task. FIXME make it configurable? diff --git a/tools/gluten-it/common/src/main/scala/org/apache/spark/sql/SparkSessionSwitcher.scala b/tools/gluten-it/common/src/main/scala/org/apache/spark/sql/SparkSessionSwitcher.scala index 17a50fd295a7a..0a1a25351ad0e 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/spark/sql/SparkSessionSwitcher.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/spark/sql/SparkSessionSwitcher.scala @@ -65,6 +65,16 @@ class SparkSessionSwitcher(val masterUrl: String, val logLevel: String) extends useSession(SessionDesc(SessionToken(token), appName)) } + def renewSession(): Unit = synchronized { + if (!hasActiveSession()) { + return + } + val sd = _activeSessionDesc + println(s"Renewing $sd session... 
") + stopActiveSession() + useSession(sd) + } + private def useSession(desc: SessionDesc): Unit = synchronized { if (desc == _activeSessionDesc) { return diff --git a/tools/gluten-it/common/src/test/java/org/apache/gluten/integration/tpc/TpcTest.java b/tools/gluten-it/common/src/test/java/org/apache/gluten/integration/CliTest.java similarity index 90% rename from tools/gluten-it/common/src/test/java/org/apache/gluten/integration/tpc/TpcTest.java rename to tools/gluten-it/common/src/test/java/org/apache/gluten/integration/CliTest.java index 2463429afdfb8..804cf3a0a5af5 100644 --- a/tools/gluten-it/common/src/test/java/org/apache/gluten/integration/tpc/TpcTest.java +++ b/tools/gluten-it/common/src/test/java/org/apache/gluten/integration/CliTest.java @@ -14,10 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.integration.tpc; +package org.apache.gluten.integration; -public class TpcTest { +public class CliTest { public static void main(String[] args) { - Tpc.main(args); + Cli.main(args); } } diff --git a/tools/gluten-it/common/src/test/java/org/apache/gluten/integration/action/TableRenderTest.scala b/tools/gluten-it/common/src/test/java/org/apache/gluten/integration/action/TableRenderTest.scala new file mode 100644 index 0000000000000..1efc721489282 --- /dev/null +++ b/tools/gluten-it/common/src/test/java/org/apache/gluten/integration/action/TableRenderTest.scala @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.gluten.integration.action + +import org.apache.gluten.integration.action.TableRender.Field._ +import org.apache.gluten.integration.action.TableRender.RowParser +import org.apache.gluten.integration.action.TableRender.RowParser.FieldAppender + +// The tests are manually run. 
+object TableRenderTest { + def case0(): Unit = { + val render: TableRender[Seq[String]] = TableRender.create( + Branch("ABC", List(Branch("AB", List(Leaf("A"), Leaf("B"))), Leaf("C"))), + Branch("DE", List(Leaf("D"), Leaf("E"))))(new RowParser[Seq[String]] { + override def parse(rowFactory: FieldAppender.RowAppender, row: Seq[String]): Unit = { + val inc = rowFactory.incremental() + row.foreach(ceil => inc.next().write(ceil)) + } + }) + render.print(Console.out) + Console.out.println() + } + + def case1(): Unit = { + val render: TableRender[Seq[String]] = TableRender.create( + Branch("ABC", List(Branch("AB", List(Leaf("A"), Leaf("B"))), Leaf("C"))), + Branch("DE", List(Leaf("D"), Leaf("E"))))(new RowParser[Seq[String]] { + override def parse(rowFactory: FieldAppender.RowAppender, row: Seq[String]): Unit = { + val inc = rowFactory.incremental() + row.foreach(ceil => inc.next().write(ceil)) + } + }) + + render.appendRow(List("aaaa", "b", "cccccc", "d", "eeeee")) + render.print(Console.out) + Console.out.println() + } + + def case2(): Unit = { + val render: TableRender[Seq[String]] = TableRender.create( + Branch("ABC", List(Branch("AAAAAAAAABBBBBB", List(Leaf("A"), Leaf("B"))), Leaf("C"))), + Branch("DE", List(Leaf("D"), Leaf("E"))))(new RowParser[Seq[String]] { + override def parse(rowFactory: FieldAppender.RowAppender, row: Seq[String]): Unit = { + val inc = rowFactory.incremental() + row.foreach(ceil => inc.next().write(ceil)) + } + }) + + render.appendRow(List("aaaa", "b", "cccccc", "d", "eeeee")) + render.print(Console.out) + Console.out.println() + } + + def case3(): Unit = { + val render: TableRender[Seq[String]] = TableRender.create( + Branch("ABC", List(Branch("AB", List(Leaf("A"), Leaf("B"))), Leaf("CCCCCCCCCCCCC"))), + Branch("DE", List(Leaf("D"), Leaf("E"))))(new RowParser[Seq[String]] { + override def parse(rowFactory: FieldAppender.RowAppender, row: Seq[String]): Unit = { + val inc = rowFactory.incremental() + row.foreach(ceil => inc.next().write(ceil)) + } + }) + + render.appendRow(List("aaaa", "b", "cccccc", "d", "eeeee")) + render.appendRow(List("aaaaaaaaaaaaa", "b", "cccccc", "ddddddddddd", "eeeee")) + render.print(Console.out) + Console.out.println() + } + + def case4(): Unit = { + val render: TableRender[Seq[String]] = TableRender.create( + Branch( + "ABBBBBBBBBBBBBBBBBBBBBBBBBBBBC", + List(Branch("AB", List(Leaf("A"), Leaf("B"))), Leaf("C"))), + Branch("DE", List(Leaf("D"), Leaf("E"))))(new RowParser[Seq[String]] { + override def parse(rowFactory: FieldAppender.RowAppender, row: Seq[String]): Unit = { + val inc = rowFactory.incremental() + row.foreach(ceil => inc.next().write(ceil)) + } + }) + + render.appendRow(List("aaaa", "b", "cccccc", "d", "eeeee")) + render.print(Console.out) + Console.out.println() + } + + def case5(): Unit = { + val leafs = List(Leaf("1"), Leaf("2"), Leaf("3"), Leaf("4")) + val render: TableRender[Seq[String]] = TableRender.create( + Leaf("Query ID"), + Branch("Succeeded", leafs), + Branch("Row Count", leafs))(new RowParser[Seq[String]] { + override def parse(rowFactory: FieldAppender.RowAppender, row: Seq[String]): Unit = { + val inc = rowFactory.incremental() + row.foreach(ceil => inc.next().write(ceil)) + } + }) + + render.appendRow( + List("q1", "true", "true", "true && true && true && true", "true", "1", "1", "1", "1")) + render.appendRow( + List("q2", "true", "true", "true", "true", "100000", "100000", "100000", "100000")) + render.print(Console.out) + Console.out.println() + } + + def main(args: Array[String]): Unit = { + case0() + case1() 
+ case2() + case3() + case4() + case5() + } +} diff --git a/tools/gluten-it/package/pom.xml b/tools/gluten-it/package/pom.xml index 1f86ee723240f..70a59cac05ea0 100644 --- a/tools/gluten-it/package/pom.xml +++ b/tools/gluten-it/package/pom.xml @@ -69,6 +69,17 @@ runtime test-jar + + org.apache.spark + spark-sql_${scala.binary.version} + runtime + + + org.apache.spark + spark-sql_${scala.binary.version} + runtime + test-jar + diff --git a/tools/gluten-it/pom.xml b/tools/gluten-it/pom.xml index c0e2fc321643f..71db637a8403e 100644 --- a/tools/gluten-it/pom.xml +++ b/tools/gluten-it/pom.xml @@ -21,7 +21,7 @@ 3.4.2 2.12 3 - 0.3.0-incubating + 0.3.2-incubating 0.8.0 1.2.0-SNAPSHOT 32.0.1-jre @@ -87,6 +87,25 @@ provided test-jar + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + + + com.google.protobuf + protobuf-java + + + provided + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + test-jar + provided + @@ -100,6 +119,15 @@ 1.8 + + java-11 + + 11 + + + 11 + + java-17 @@ -117,105 +145,29 @@ 3.2.2 - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - - - com.google.protobuf - protobuf-java - - - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - test-jar - - spark-3.3 3.3.1 - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - - - com.google.protobuf - protobuf-java - - - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - test-jar - - spark-3.4 3.4.2 - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - - - com.google.protobuf - protobuf-java - - - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - test-jar - - spark-3.5 3.5.1 - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - - - com.google.protobuf - protobuf-java - - - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - test-jar - - celeborn-0.4 - 0.4.0-incubating + 0.4.1 diff --git a/tools/gluten-it/sbin/gluten-it.sh b/tools/gluten-it/sbin/gluten-it.sh index 5262df0541e3a..fda1174179363 100755 --- a/tools/gluten-it/sbin/gluten-it.sh +++ b/tools/gluten-it/sbin/gluten-it.sh @@ -47,4 +47,4 @@ $JAVA_HOME/bin/java $GLUTEN_IT_JVM_ARGS \ -Djdk.reflect.useDirectMethodHandle=false \ -Dio.netty.tryReflectionSetAccessible=true \ -cp $JAR_PATH \ - org.apache.gluten.integration.tpc.Tpc $@ + org.apache.gluten.integration.Cli $@
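
Note on the SparkQueryRunner hunk above: planning time is now derived from rule-level timings rather than from the tracker's phase timings, by combining the DataFrame's own QueryPlanningTracker with a second tracker that is active while collect() runs. The following is a minimal illustrative sketch of that pattern, not part of the diff; the object name PlanningTimeSketch and the method runOnce are hypothetical, and it assumes an already-constructed SparkSession.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.QueryPlanningTracker

object PlanningTimeSketch {
  // Returns (planning millis, execution millis) for a single SQL statement.
  def runOnce(spark: SparkSession, sql: String): (Long, Long) = {
    val tracker = new QueryPlanningTracker
    val start = System.nanoTime()
    val df = spark.sql(sql)
    // Rules that fire while the query is being executed (outside the DataFrame's
    // own tracker) are recorded by the thread-local `tracker` set up here.
    QueryPlanningTracker.withTracker(tracker) {
      df.collect()
    }
    // Rules that fired while the DataFrame was analyzed and optimized live in its own tracker.
    val analysisSideMillis = df.queryExecution.tracker.rules.map(_._2.totalTimeNs).sum / 1000000L
    val executionSideMillis = tracker.rules.map(_._2.totalTimeNs).sum / 1000000L
    val planMillis = analysisSideMillis + executionSideMillis
    val totalMillis = (System.nanoTime() - start) / 1000000L
    (planMillis, totalMillis - planMillis)
  }
}

This mirrors the computation in the renamed SparkQueryRunner.runQuery, where the sum of the two rule-time totals replaces the earlier phase-based planning-time estimate.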